---
title: "Gathering Data for the SPI Literature Review Paper"
author: "Brian Stacy"
date: "12/20/2024"
output: html_document
---

This file will gather data, mostly from the World Bank's World Development Indicators combine them with SPI data and data from other indicators of statistical performance, and save the resulting data as CSVs for use in the file that produces the final tables and figures.

Note: By re-running this code, you will overwrite all data used to produce the tables and figures for the paper, and thus the results may not replicate the final results in the paper.

```{r setup, include=FALSE}
knitr::opts_chunk$set(
	echo = TRUE,
	message = FALSE,
	warning = FALSE,
	fig.height=6,
	fig.width = 8,
	dev=c("png","tiff")
)
library(data.table)
library(tidyverse)
library(flextable)
library(here)
library(haven)
library(wbstats)
library(readxl)
library(zoo)
#devtools::install_github("worldbank/pipr")
library(pipr) #World Bank PIP package

dir <- here()
raw_dir <- paste0(dir, "/01_raw_data/")
output_dir <- paste(dir, '/03_output/', sep="/")
#weights (either unity (1) or population)
wgt <- 1
end_date=2023
```

## Poverty Surveys

```{r, eval=FALSE}
#| label: pipdata

#read in PIP data for Figure 1 from the paper.
#new pipr package already includes survey coverage and comparability measures
pip_df <- get_stats()

# get mean consumption/income per day as of latest year (mean)
welfare_df <- pip_df %>%
  filter(survey_coverage %in% c('national')) %>% #keep just the national surveys
  filter(reporting_level %in% c('national')) %>% #keep just the national survey
  rename(iso3c=country_code,
         mean_welfare=mean) %>%
  group_by(iso3c) %>%
  filter(year==max(year)) %>%
  #keep consumption survey for Poland, Haiti, Angola, PHL
  filter(!(iso3c=="POL" & welfare_type=='income'))    %>%
  filter(!(iso3c=="HTI" & welfare_type=='income'))    %>%
  filter(!(iso3c=="AGO" & welfare_type=='income'))    %>%
  filter(!(iso3c=="PHL" & welfare_type=='income'))    %>%
  select(iso3c, mean_welfare)  

#reshape the data
pip_df <- pip_df %>%
  filter(survey_coverage %in% c('national')) %>% #keep just the national surveys
  filter(reporting_level %in% c('national')) %>% #keep just the national surveys
  filter(between(year, 1982, end_date)) %>% #keep data from 1982 to end_date
  #aggregate totals by country
  rename(iso3c=country_code) %>%
  group_by(iso3c) %>%
  summarise(
    number_surveys=n(),
    avg_mean_welfare = mean(mean)
  ) #get counts of the number of surveys by country 

#add welfare
pip_df <- pip_df %>%
  left_join(welfare_df)

#save
write_excel_csv(pip_df, paste0(output_dir, "/Figure_1_data.csv"))

```


```{r spidataload, echo=TRUE}
#add in population
pop_df <- wbstats::wb_data(country="all",
             indicator='SP.POP.TOTL',
             start_date=2004,
             end_date=end_date) %>%
  mutate(date=as.numeric(date)) %>%
  mutate(population=SP.POP.TOTL) %>%
  select(country, date, population) 
#import data
spi_index_df <- read_csv(paste(raw_dir, 'SPI_index.csv', sep="/")) %>%
    left_join(pop_df) 


spi_df_final <- read_csv( file = paste(raw_dir, 'SPI_data.csv', sep="/")) %>%
  left_join(pop_df) 
#metadata 
metadata <- data.frame(
  series = c('SPI.INDEX', 'SPI.INDEX.PIL1', 'SPI.INDEX.PIL2', 'SPI.INDEX.PIL3', 'SPI.INDEX.PIL4', 'SPI.INDEX.PIL5'),
  indicator_name = c('SPI Overall Score', 'Pillar 1: Data Use', 'Pillar 2: Data Services', 'Pillar 3: Data Products', 'Pillar 4: Data Sources', 'Pillar 5: Data Infrastructure')
)
#get list of country info
country_list <- wbstats::wb_countries()

write_excel_csv(country_list, paste0(raw_dir, "country_metadata.csv"))

SPI_end_date <- spi_index_df %>%
  filter(date==end_date) %>%
  filter(!is.na(SPI.INDEX) & !is.na(weights)) %>%
  mutate(ISO_A3_EH=iso3c) 

iso_codes <- read_csv(paste0(raw_dir, "/metadata/iso_codes.csv")) 


SPI <- spi_index_df
#read in TopoJSON from World Bank
#countries <- geojsonio::geojson_read("WB_countries_Admin0_lowres.geojson",
#                                     what = "sp")
```


```{r}
span <- c(2004:end_date)

#produce an empty dataset with the right number of rows (country X year observations)
spi_df_empty <- bind_rows(replicate(length(span), wbstats::wbcountries(), simplify = FALSE), .id='date') %>%
  mutate(date=as.numeric(date)+span[1]-1) %>%
  filter(region!="Aggregates") # take out the aggregates (LAC, SAR, etc)

```


## Correlates using Latest Year

```{r correlates}


start_date=2010

correlates_df <- wb_data(country="all", 
              indicator=c('SI.POV.DDAY', 'SI.POV.GINI'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(country) %>%
  right_join(spi_df_empty) %>%
  fill(c('SI.POV.DDAY', 'SI.POV.GINI'), .direction = 'downup') %>%
  filter(date==end_date)


ge_df <- wb_data(country="all", 
              indicator=c('GE.EST'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(country) %>%
  right_join(spi_df_empty) %>%
  fill(c('GE.EST'), .direction = 'downup') %>%
  filter(date==end_date) %>%
  select(iso3c, date, GE.EST)

gdp_df <- wb_data(country="all", 
              indicator=c('NY.GDP.PCAP.KD'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(country) %>%
  right_join(spi_df_empty) %>%
  fill(c('NY.GDP.PCAP.KD'), .direction = 'downup') %>%
  filter(date==end_date) %>%  select(iso3c, date, NY.GDP.PCAP.KD)

hci_df <- wb_data(country="all", 
              indicator=c('HD.HCI.OVRL'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(country) %>%
  right_join(spi_df_empty) %>%
  fill(c('HD.HCI.OVRL'), .direction = 'downup') %>%
  filter(date==end_date) %>%
  select(iso3c, HD.HCI.OVRL)

undernourish_df <- wb_data(country="all", 
              indicator=c('SN.ITK.DEFC.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(country) %>%
  right_join(spi_df_empty) %>%
  fill(c('SN.ITK.DEFC.ZS'), .direction = 'downup') %>%
  filter(date==end_date) %>%
  select(iso3c, date, SN.ITK.DEFC.ZS)

mmt_df <-  wb_data(country="all", 
              indicator=c('SH.STA.MMRT'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(country) %>%
  right_join(spi_df_empty) %>%
  fill(c('SH.STA.MMRT'), .direction = 'downup') %>%
  filter(date==end_date) %>%
  select(iso3c,  SH.STA.MMRT)


lpov_df <-  wb_data(country="all", 
              indicator=c('SE.LPV.PRIM'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(country) %>%
  right_join(spi_df_empty) %>%
  fill(c('SE.LPV.PRIM'), .direction = 'downup') %>%
  filter(date==end_date)  %>%
  select(iso3c, date, SE.LPV.PRIM)  


wbl_df <- wb_data(country="all", 
              indicator=c('SG.LAW.INDX'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(country) %>%
  right_join(spi_df_empty) %>%
  fill(c('SG.LAW.INDX'), .direction = 'downup') %>%
  filter(date==end_date) %>%
  select(iso3c, date, SG.LAW.INDX)

safely_df <- wb_data(country="all", 
              indicator=c('SH.H2O.SMDW.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(country) %>%
  right_join(spi_df_empty) %>%
  fill(c('SH.H2O.SMDW.ZS'), .direction = 'downup') %>%
  filter(date==end_date) %>%
  select(iso3c, date, SH.H2O.SMDW.ZS)
  
elect_df <- wb_data(country="all", 
              indicator=c('EG.ELC.ACCS.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(country) %>%
  right_join(spi_df_empty) %>%
  fill(c('EG.ELC.ACCS.ZS'), .direction = 'downup') %>%
  filter(date==end_date) %>%
  select(iso3c, date, EG.ELC.ACCS.ZS)
  
manuf_df <- wb_data(country="all", 
              indicator=c('NV.IND.MANF.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(country) %>%
  right_join(spi_df_empty) %>%
  fill(c('NV.IND.MANF.ZS'), .direction = 'downup') %>%
  filter(date==end_date) %>%
  select(iso3c, date, NV.IND.MANF.ZS)


slums_df <- wb_data(country="all", 
              indicator=c('EN.POP.SLUM.UR.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(country) %>%
  right_join(spi_df_empty) %>%
  fill(c('EN.POP.SLUM.UR.ZS'), .direction = 'downup') %>%
  filter(date==end_date) %>%
  select(iso3c,  EN.POP.SLUM.UR.ZS)

ff_sub_df <- read_csv(paste0(raw_dir, "/Subsidies.csv")) %>%
  rename(iso3c=code)
  

ghg_df <- wb_data(country="all", 
              indicator=c('EN.GHG.ALL.MT.CE.AR5'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(country) %>%
  right_join(spi_df_empty) %>%
  fill(c('EN.GHG.ALL.MT.CE.AR5'), .direction = 'downup') %>%
  filter(date==end_date) %>%
  select(iso3c,  EN.GHG.ALL.MT.CE.AR5)

protected_df <- wb_data(country="all", 
              indicator=c('ER.MRN.PTMR.ZS','ER.LND.PTLD.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(country) %>%
  fill(c('ER.MRN.PTMR.ZS','ER.LND.PTLD.ZS'), .direction = 'downup') %>%

  filter(date==end_date) %>%
  select(iso3c, date, ER.MRN.PTMR.ZS,ER.LND.PTLD.ZS)

debt_df <- wb_data(country="all", 
              indicator=c('DT.TDS.DECT.EX.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(country) %>%
  fill(c('DT.TDS.DECT.EX.ZS'), .direction = 'downup') %>%

  filter(date==end_date) %>%
  select(iso3c, date, DT.TDS.DECT.EX.ZS)

#bring in Sachs SDG index
#https://dashboards.sdgindex.org/explorer
# accessed on 2023-07-20

sdg_index <- read_excel(path=paste0(raw_dir, "/SDR2024-data.xlsx"),
                        sheet='Backdated SDG Index'
) %>%
  transmute(iso3c=id,
         date=year,
         sdg_index_score=`SDG Index Score`) %>%
  group_by(iso3c) %>%
  fill(c('sdg_index_score'), .direction = 'downup') %>%
  filter(date>=end_date) %>% #take 5 latest years
  summarise(across(c('sdg_index_score'), mean, na.rm=TRUE)) %>%
  select(iso3c, sdg_index_score)


#the UN’s HDI (https://hdr.undp.org/data-center/human-development-index#/indicies/HDI).  Downloaded Aug 24, 2023

hdi_df <- read_csv(paste0(raw_dir, 'HDR21-22_Composite_indices_complete_time_series.csv')) %>%
  rename(iso3c=iso3) %>%
  select(iso3c, hdi_1990:hdi_2021) %>%
  pivot_longer(
    cols=c('hdi_1990':'hdi_2021'),
    names_to='index',
    values_to='hdi_value'
  ) %>%
  mutate(
    date=as.numeric(str_sub(index,5,8))
  ) %>%
  group_by(iso3c) %>%
  fill(c('hdi_value'), .direction = 'downup') %>%
  filter(date>=2021) %>% #take 5 latest years
  summarise(across(c('hdi_value'), mean, na.rm=TRUE)) %>%
  select(iso3c, hdi_value)
              
#Yale’s Environmental Performance Index (https://epi.yale.edu/).  Downloaded Aug 24, 2023
epi_2024_df <- read_csv(paste0(raw_dir, 'epi2024results.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.new) %>%
  mutate(date=2024) %>%
  select(iso3c, date, env_perform_index)

epi_2022_df <- read_csv(paste0(raw_dir, 'epi2022results05302022.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.new) %>%
  mutate(date=2022) %>%
  select(iso3c, date, env_perform_index)

epi_2020_df <- read_csv(paste0(raw_dir, 'epi2020results.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.new) %>%
  mutate(date=2020) %>%
  select(iso3c, date, env_perform_index)

epi_2018_df <- read_csv(paste0(raw_dir, 'epi2018results.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.current) %>%
  mutate(date=2018) %>%
  select(iso3c, date, env_perform_index)

epi_df <- bind_rows(epi_2024_df,epi_2022_df, epi_2020_df, epi_2018_df) %>%
  group_by(iso3c) %>%
  fill(c('env_perform_index'), .direction = 'downup') %>%
  filter(date>=2023) %>% #take 5 latest years
  summarise(across(c('env_perform_index'), mean, na.rm=TRUE)) %>%
  select(iso3c, env_perform_index)

# FAO food security index and prevalence of undernourishment (https://www.fao.org/faostat/en/#data/FS)
fao_stats_df <- read_csv(paste0(raw_dir, "FAOSTAT_data_en_1-6-2025.csv")) %>%
    mutate(geoAreaCode=as.numeric(`Area Code (M49)`)) %>% #convert to numeric
    left_join(
        read_csv(paste0(raw_dir, "/metadata/iso_codes.csv") ) %>%
            select(geoAreaCode, iso3c)
    ) %>%
    filter(Item %in% c(
        "Prevalence of severe food insecurity in the total population (percent) (3-year average)",
        "Prevalence of undernourishment (percent) (3-year average)"
    )) %>%
    mutate(date=as.numeric(str_sub(`Year Code`,1,4))+1)  %>% #add 1 because it is the midpoint of 3 year avg
    select(iso3c, date, Item, Value) %>% #keep only relevant columns
    filter(!is.na(iso3c)) %>%
    #pivot wider
    pivot_wider(
        names_from=Item,
        values_from=Value,
    ) %>%
    #shorten names
    rename(
        undernourish=`Prevalence of undernourishment (percent) (3-year average)`,
        severe_food_insecurity=`Prevalence of severe food insecurity in the total population (percent) (3-year average)`
    ) %>%
    #keep only numeric values in Value using str_remove_all
    mutate(
        undernourish=as.numeric(str_remove_all(undernourish, "[^0-9.]")),
        severe_food_insecurity=as.numeric(str_remove_all(severe_food_insecurity, "[^0-9.]"))
    ) %>%
  group_by(iso3c) %>%
  fill(c('undernourish','severe_food_insecurity'), .direction = 'downup') %>%
  filter(date>=end_date) %>% #take 5 latest years
  summarise(across(c('undernourish','severe_food_insecurity'), mean, na.rm=TRUE)) %>%
  select(iso3c, undernourish, severe_food_insecurity) 



##Harvard’s Economic complexity index (https://atlas.cid.harvard.edu/rankings).  Downloaded Aug 24, 2023
ec_complex_df <- read_dta(paste0(raw_dir, "Country Complexity Rankings 1995 - 2022.dta")) %>%
  rename(geoAreaCode=country_id,
         eci_value=hs_eci,
         date=year) %>%
  left_join(iso_codes) %>%
  group_by(iso3c) %>%
  filter(date>=2021) %>% #take 5 latest years
  summarise(across(c('eci_value'), mean, na.rm=TRUE)) %>%
  select(iso3c, eci_value)


#World press freedom index (https://rsf.org/en/index).  Downloaded Aug 24, 2023
press_free_df <- bind_rows(
  read_delim(paste0(raw_dir, "press_freedom_2023.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2022.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2021.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2020.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2019.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2018.csv"), delim=";")
) %>%
  mutate(Score=if_else(is.na(Score),`Score N`, Score )) %>%
  mutate(Score=case_when(
    Score<100 ~ Score,
    between(Score,100,999) ~ Score/10,
    TRUE ~ Score/100
  )) %>% #fix issue with comma decimal separator  
  transmute(
    iso3c=ISO,
    press_free_score=Score,
    date=`Year (N)`
  ) %>%
  group_by(iso3c) %>%
  fill(c('press_free_score'), .direction = 'downup') %>%
  filter(date>=2023) %>% #take 5 latest years
  summarise(across(c('press_free_score'), mean, na.rm=TRUE)) %>%
  select(iso3c, press_free_score)

#OECD’s better life index (https://www.oecdbetterlifeindex.org/#/11111111111).  Downloaded Aug 24, 2023.  Only one year of data as not comparable over time.
bli_df <- read_csv(paste0(raw_dir, 'BLI_20230824.csv')) %>%
  rename(iso3c=LOCATION) %>%
  mutate(
    category=case_when(
      Indicator %in% c("Housing expenditure", "Dwellings without basic facilities", "Rooms per person") ~ "Housing",
      Indicator %in% c("Household net wealth", "Household net adjusted disposable income") ~ "Income",
      Indicator %in% c('Labour market insecurity', 'Personal earnings', 'Long-term unemployment rate', 'Employment rate') ~ "Jobs",
      Indicator %in% c('Quality of support network') ~ "Community",
      Indicator %in% c('Years in education','Student skills','Educational attainment') ~ "Education",
      Indicator %in% c('Water quality', "Air pollution") ~ "Environment",
      Indicator %in% c('Stakeholder engagement for developing regulations', 'Voter turnout') ~ "Civic engagement",
      Indicator %in% c('Self-reported health', 'Life expectancy') ~ "Health",
      Indicator %in% c('Life satisfaction') ~ "Life satisfaction",
      Indicator %in% c('Homicide rate', 'Feeling safe walking alone at night') ~ "Safety",
      Indicator %in% c('Time devoted to leisure and personal care','Employees working very long hours') ~ "Work-Life Balance"
      
    )
  ) %>%
  group_by(Indicator) %>%
  mutate(zscore=(Value-mean(Value, na.rm=TRUE))/sd(Value, na.rm=TRUE)) %>%
  mutate(
    score=case_when(#give 1 to a best perfomer and 0 to worst
      Indicator %in% c("Housing expenditure", "Dwellings without basic facilities",
                       'Labour market insecurity','Long-term unemployment rate',
                       "Air pollution", 'Homicide rate', 'Employees working very long hours') ~
        1-(Value-min(Value))/(max(Value)-min(Value)),
      TRUE ~ 
        (Value-min(Value))/(max(Value)-min(Value))
    )
  ) %>%
  #zscore all indicators
  group_by(iso3c,category  ) %>%
  #combine indicators with equal weight
  summarise(better_life_subindex=mean(score, na.rm=TRUE)) %>%
  group_by(iso3c) %>%
  summarise(better_life_index=sum(better_life_subindex))

#add legatum prosperity index
# https://www.prosperity.com/
# downloaded on 2024-12-20
legatum_prosperity_df <- read_csv(paste0(raw_dir, 'legatum_prosperity_index.csv')) %>%
  janitor::clean_names() %>%
  transmute(
    iso3c=iso3c,
    date=end_date,
    legatum_health_index=health
  ) %>%
  select(iso3c, legatum_health_index)

# add Global Health Security Index (GHSI) overall score
ghsi_df <- read_csv(paste0(raw_dir, '2021-GHS-Index-April-2022.csv')) %>%
  janitor::clean_names() %>%

  transmute(
    iso3c=iso3c,
    date=year,
    ghsindex=overall_score
  ) %>%
  select(iso3c, date, ghsindex) %>%
  group_by(iso3c) %>%
  filter(date>=2021) %>% #take 5 latest years
  summarise(across(c('ghsindex'), mean, na.rm=TRUE)) %>%
  select(iso3c, ghsindex)

#bring it all together
correlates_df_end_date <- correlates_df %>%
  left_join(ge_df) %>%
  left_join(gdp_df) %>%
  left_join(hci_df) %>%
  left_join(undernourish_df) %>%
  left_join(mmt_df) %>%
  left_join(lpov_df) %>%  left_join(bli_df) %>%       left_join(fao_stats_df) %>%
  
  left_join(wbl_df) %>%
  left_join(safely_df) %>%
  left_join(elect_df) %>%
  left_join(manuf_df) %>%
  left_join(slums_df) %>%
  left_join(ff_sub_df) %>%
  left_join(ghg_df) %>%
  left_join(protected_df) %>%
  left_join(debt_df) %>%
  left_join(sdg_index) %>%
  left_join(hdi_df) %>%
  left_join(epi_df) %>%
  left_join(ec_complex_df) %>%
  left_join(press_free_df) %>%
  left_join(bli_df) %>%       left_join(fao_stats_df) %>%
  left_join(legatum_prosperity_df) %>% left_join(ghsi_df) %>%
  filter(region!="Aggregates") %>%
  select(iso3c, c('SI.POV.DDAY', 'SI.POV.GINI','GE.EST','NY.GDP.PCAP.KD','HD.HCI.OVRL','SN.ITK.DEFC.ZS','SH.STA.MMRT','SE.LPV.PRIM','SG.LAW.INDX','SH.H2O.SMDW.ZS',"EG.ELC.ACCS.ZS",'NV.IND.MANF.ZS','EN.POP.SLUM.UR.ZS','subsidies','EN.GHG.ALL.MT.CE.AR5','ER.MRN.PTMR.ZS','ER.LND.PTLD.ZS','DT.TDS.DECT.EX.ZS', 'sdg_index_score', 'hdi_value', 'env_perform_index', 'eci_value', 'press_free_score', 'better_life_index',                   'undernourish', 'severe_food_insecurity', 'legatum_health_index', 'ghsindex')) 


sdgs <- c('SI.POV.DDAY','SN.ITK.DEFC.ZS','SH.STA.MMRT','SE.LPV.PRIM','SG.LAW.INDX',
          'SH.H2O.SMDW.ZS','EG.ELC.ACCS.ZS','NY.GDP.PCAP.KD','NV.IND.MANF.ZS','SI.POV.GINI',
          'EN.POP.SLUM.UR.ZS','subsidies','EN.GHG.ALL.MT.CE.AR5','ER.MRN.PTMR.ZS','ER.LND.PTLD.ZS'
          ,'GE.EST','DT.TDS.DECT.EX.ZS', 'sdg_index_score')

sdg_names <- c('SDG 1: Extreme Poverty','SDG 2: Undernourishment','SDG 3: Maternal Mortality','SDG 4: Learning Poverty','SDG 5: Women, Business, Law Index',
          'SDG 6: Safely Managed Water','SDG 7: Access to Electricity','SDG 8: GDP per capita (2015 constant $)','SDG 9: Manufacturing value added (% of GDP)','SDG 10: Gini Index',
          'SDG 11: Population in Slums','SDG 12: Fossil Fuel Subsidies (% of GDP)','SDG 13: Greenhouse Gas Emissions','SDG 14: Marine protected areas','SDG 15: Terrestrial Protected Areas','SDG 16: Government Effectiveness','SDG 17: Total Debt Service', 'SDR: SDG Index Overall Score')
```

Markdown table with variable and description in correlates_df
| Variable | Description |
| -- | -- |
| SI.POV.DDAY | Proportion of population living below the national poverty line |
| SI.POV.GINI | Gini index |
| GE.EST | Government Effectiveness |
| NY.GDP.PCAP.KD | GDP per capita (2015 constant $) |
| HD.HCI.OVRL | Human Capital Index |
| SN.ITK.DEFC.ZS | Prevalence of undernourishment |
| SH.STA.MMRT | Maternal Mortality Ratio |
| SE.LPV.PRIM | Learning Poverty |
| SG.LAW.IND | Women, Business, Law Index |
| SH.H2O.SMDW.ZS | Safely Managed Water |
| EG.ELC.ACCS.ZS | Access to Electricity |
| NV.IND.MANF.ZS | Manufacturing value added (% of GDP) |
| EN.POP.SLUM.UR.ZS | Population in Slums |
| subsidies | Fossil Fuel Subsidies (% of GDP) |
| EN.GHG.ALL.MT.CE.AR5 | Greenhouse Gas Emissions |
| ER.MRN.PTMR.ZS | Marine protected areas |
| ER.LND.PTLD.ZS | Terrestrial Protected Areas |
| DT.TDS.DECT.EX.ZS | Total Debt Service |
| sdg_index_score | SDG Index Overall Score |
| hdi_value | Human Development Index |
| env_perform_index | Environmental Performance Index |
| eci_value | Economic Complexity Index |
| press_free_score | Press Freedom Index |
| better_life_index | OECD Better Life Index |



```{r comparison, include=FALSE}



#pull SCI values

sci_df <- read_excel(paste0(raw_dir, "Statistical_Capacity_Indicators.xlsx")) %>%
  mutate(YR2004=as.numeric(YR2004),
         iso3c=`Country Code`) %>%
  filter(`Series Code`=="IQ.SCI.OVRL") %>%
  select(-`Country Code`, -`Series Code`) %>%
  pivot_longer(
    cols=c(YR2004:YR2020),
    names_to='YR',
    values_to='SCI'
  ) %>%
  mutate(date=str_remove_all(YR,"YR"),
         date=as.numeric(date)) %>%
  select(iso3c, date, SCI)
  
  
  

# onvert it to a data frame.
sci_df <- sci_df %>%
  left_join(spi_df_empty) %>% #add on country metadata
  filter(!is.na(income)) %>%
  select(iso3c, date, SCI  ) %>%
  group_by(date) %>%
  arrange(-SCI) %>%
  mutate(SCI_rank=rank(-SCI),
         SCI_rank=if_else(is.na(SCI),as.numeric(NA),SCI_rank)) 
  
#write to csv
write_excel_csv(sci_df, paste0(output_dir, "/SCI_formatted_data.csv"))


#read in ODIN data
  for (i in c(2015,2016,2017,2018,2020,2022)) {
   temp <- read_csv(paste(raw_dir, '/','ODIN_',i,'.csv', sep=""))  %>%
        as_tibble(.name_repair = 'universal') %>%
        mutate(date=i) %>%
        filter(Data.categories=='All Categories') %>%
        select(-Year)


    assign(paste('odin_df',i,sep="_"), temp)
  }

    #bind different years together
odin_df <- bind_rows(odin_df_2015, odin_df_2016, odin_df_2017, odin_df_2018, odin_df_2020, odin_df_2022)


odin_df <- odin_df %>%
    select(Country.Code, date, Overall.score) %>%
    rename(iso3c=Country.Code,
           ODIN_score=Overall.score) %>%
    group_by(date) %>%
    arrange(-ODIN_score) %>%
    mutate(ODIN_rank=rank(-ODIN_score, na.last='NA')) %>% 
  right_join(spi_df_empty) %>%
  arrange(country, date) %>%
  group_by(country) %>%
  mutate(across(starts_with("ODIN"), na.locf, na.rm=FALSE)) %>%
  left_join(wbstats::wb_countries()) %>%
  select(iso3c, date, starts_with("ODIN"))

#write to csv
write_excel_csv(odin_df, paste0(output_dir, "/ODIN_formatted_data.csv"))

#read in the ODB
odb_df_2017 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2017-Rankings') %>%
  transmute(
    iso3c=ISO3,
    odb=`ODB-Score`
  )


odb_df_2016_info <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2016-Rankings') %>%
  select(ISO3, Country) 

odb_df_2016 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2016-Absolute-Recalculated') %>%
  left_join(odb_df_2016_info) %>%
  transmute(
    iso3c=ISO3,
    odb_2016=`ODB Score (0-100)`
  )

odb_df_2015 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2015-Rankings')
odb_df_2014 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2014-Rankings')
odb_df_2013 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2013-Rankings')

odb_df <- bind_rows(
  (odb_df_2017 %>% mutate(date=2017)),
  (odb_df_2016 %>% mutate(date=2016, odb=odb_2016)),
  (odb_df_2015 %>% mutate(date=2015, iso3c=ISO3, odb=`ODB-Score-Scaled`)),
  (odb_df_2014 %>% mutate(date=2014, iso3c=ISO3, odb=`ODB-Score-Scaled`)),
  (odb_df_2013 %>% mutate(date=2013, iso3c=ISO3, odb=`ODB-Score-Scaled`)),
  ) %>%
  select(iso3c, date, odb) %>%
  left_join(country_list)

#write to csv
write_excel_csv(odb_df, paste0(output_dir, "/ODB_formatted_data.csv"))

#read in the GDB
gdb_df <- read_csv(paste0(raw_dir, "/global-data-barometer-topic--all-countries.csv")) %>%
  transmute(iso3c=iso,
            gdb=`GDB 2021 score`)

gdb_reg_df <- gdb_df %>% 
  mutate(date=2021) %>%
  select(iso3c, date, gdb) %>%
  left_join(country_list)

#write to csv
write_excel_csv(gdb_reg_df, paste0(output_dir, "/GDB_formatted_data.csv"))

#IIAG
iiag_df <- read_csv(paste0(raw_dir, "/2022 IIAG_Scores.csv")) %>%
  filter(Year==2022) %>%
  transmute(
    iso2c=Country_ISO,
    iiag_overall=`OVERALL GOVERNANCE`,
    iiag_stats=`Capacity of the Statistical System`
  ) %>%
  left_join(country_list) %>%
  select(iso3c,iiag_overall,iiag_stats )
  
#write to csv
write_excel_csv(iiag_df, paste0(output_dir, "/IIAG_formatted_data.csv"))


#create one database
comparison_df <- spi_index_df %>%
  ungroup() %>%
  filter(date ==end_date) %>%
  arrange(-SPI.INDEX) %>%
  mutate(across(starts_with('SPI.INDEX'),~1*.),
         across(starts_with('SPI.INDEX'),round,1)) %>%
  select(country, iso3c, date, income, region, SPI.INDEX) %>%
  left_join(sci_df %>% filter(date==2020) %>% ungroup() %>% select(-date)) %>%
  left_join(odin_df) %>%
  left_join(odb_df_2017) %>%
  left_join(odb_df_2016) %>%
  mutate(odb=if_else(is.na(odb),odb_2016,odb)) %>%
  left_join(gdb_df) %>%
  left_join(iiag_df) %>%
  left_join(correlates_df_end_date)

#save to output
write_excel_csv(comparison_df, paste0(output_dir, "/SPI_Index_SDG_comparisons_data.csv"))


```

Markdown table with variable and description in comparison_df
| Variable | Description |
| -- | -- |
| SPI.INDEX | Statistical Performance Indicators Overall Score |
| SCI | Statistical Capacity Indicator Index |
| ODIN_score | Open Data Index |
| odb | Open Data Barometer |
| gdb | Global Data Barometer |



### Do the same thing, but instead of latest year of data available take the average of 5 preceding years.


```{r correlates5avg}

#instead of the latest value, take the 5 year average

start_date=2010

correlates_df <- wb_data(country="all", 
              indicator=c('SI.POV.DDAY', 'SI.POV.GINI'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(country) %>%
  right_join(spi_df_empty) %>%
  group_by(iso3c, country, region, income, lending) %>%
  fill(c('SI.POV.DDAY', 'SI.POV.GINI'), .direction = 'downup') %>%
  filter(date>=(end_date-4)) %>% #take 5 latest years
  summarise(across(c('SI.POV.DDAY', 'SI.POV.GINI'), mean, na.rm=TRUE))  %>%
  mutate(date=end_date)


ge_df <- wb_data(country="all", 
              indicator=c('GE.EST'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('GE.EST'), .direction = 'downup') %>%
  filter(date>=end_date-4) %>% #take 5 latest years
  summarise(across(c('GE.EST'), mean, na.rm=TRUE)) %>%
  select(iso3c, GE.EST)

gdp_df <- wb_data(country="all", 
              indicator=c('NY.GDP.PCAP.KD'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('NY.GDP.PCAP.KD'), .direction = 'downup') %>%
  filter(date>=end_date-4) %>% #take 5 latest years
  summarise(across(c('NY.GDP.PCAP.KD'), mean, na.rm=TRUE)) %>%
  select(iso3c, NY.GDP.PCAP.KD)

hci_df <- wb_data(country="all", 
              indicator=c('HD.HCI.OVRL'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('HD.HCI.OVRL'), .direction = 'downup') %>%
   filter(date>=end_date-4) %>% #take 5 latest years
  summarise(across(c('HD.HCI.OVRL'), mean, na.rm=TRUE))  %>%
  select(iso3c, HD.HCI.OVRL)

undernourish_df <- wb_data(country="all", 
              indicator=c('SN.ITK.DEFC.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('SN.ITK.DEFC.ZS'), .direction = 'downup') %>%
  filter(date>=end_date-4) %>% #take 5 latest years
  summarise(across(c('SN.ITK.DEFC.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c, SN.ITK.DEFC.ZS)

mmt_df <-  wb_data(country="all", 
              indicator=c('SH.STA.MMRT'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('SH.STA.MMRT'), .direction = 'downup') %>%
  filter(date>=end_date-4) %>% #take 5 latest years
  summarise(across(c('SH.STA.MMRT'), mean, na.rm=TRUE)) %>%
  select(iso3c,  SH.STA.MMRT)


lpov_df <-  wb_data(country="all", 
              indicator=c('SE.LPV.PRIM'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('SE.LPV.PRIM'), .direction = 'downup') %>%
  filter(date>=end_date-4) %>% #take 5 latest years
  summarise(across(c('SE.LPV.PRIM'), mean, na.rm=TRUE))  %>%
  select(iso3c, SE.LPV.PRIM)  


wbl_df <- wb_data(country="all", 
              indicator=c('SG.LAW.INDX'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('SG.LAW.INDX'), .direction = 'downup') %>%
  filter(date>=end_date-4) %>% #take 5 latest years
  summarise(across(c('SG.LAW.INDX'), mean, na.rm=TRUE)) %>%
  select(iso3c,  SG.LAW.INDX)

safely_df <- wb_data(country="all", 
              indicator=c('SH.H2O.SMDW.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('SH.H2O.SMDW.ZS'), .direction = 'downup') %>%
  filter(date>=end_date-4) %>% #take 5 latest years
  summarise(across(c('SH.H2O.SMDW.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c, SH.H2O.SMDW.ZS)
  
elect_df <- wb_data(country="all", 
              indicator=c('EG.ELC.ACCS.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('EG.ELC.ACCS.ZS'), .direction = 'downup') %>%
  filter(date>=end_date-4) %>% #take 5 latest years
  summarise(across(c('EG.ELC.ACCS.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c, EG.ELC.ACCS.ZS)
  
manuf_df <- wb_data(country="all", 
              indicator=c('NV.IND.MANF.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('NV.IND.MANF.ZS'), .direction = 'downup') %>%
  filter(date>=end_date-4) %>% #take 5 latest years
  summarise(across(c('NV.IND.MANF.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c, NV.IND.MANF.ZS)


slums_df <- wb_data(country="all", 
              indicator=c('EN.POP.SLUM.UR.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('EN.POP.SLUM.UR.ZS'), .direction = 'downup') %>%
    filter(date>=end_date-4) %>% #take 5 latest years
  summarise(across(c('EN.POP.SLUM.UR.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c,  EN.POP.SLUM.UR.ZS)

ff_sub_df <- read_csv(paste0(raw_dir, "/Subsidies.csv")) %>%
  rename(iso3c=code)
  

ghg_df <- wb_data(country="all", 
              indicator=c('EN.GHG.ALL.MT.CE.AR5'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('EN.GHG.ALL.MT.CE.AR5'), .direction = 'downup') %>%
  filter(date>=end_date-4) %>% #take 5 latest years
  summarise(across(c('EN.GHG.ALL.MT.CE.AR5'), mean, na.rm=TRUE)) %>%
  select(iso3c,  EN.GHG.ALL.MT.CE.AR5)

protected_df <- wb_data(country="all", 
              indicator=c('ER.MRN.PTMR.ZS','ER.LND.PTLD.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  fill(c('ER.MRN.PTMR.ZS','ER.LND.PTLD.ZS'), .direction = 'downup') %>%
  filter(date>=end_date-4) %>% #take 5 latest years
  summarise(across(c('ER.MRN.PTMR.ZS', 'ER.LND.PTLD.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c, ER.MRN.PTMR.ZS,ER.LND.PTLD.ZS)

debt_df <- wb_data(country="all", 
              indicator=c('DT.TDS.DECT.EX.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  fill(c('DT.TDS.DECT.EX.ZS'), .direction = 'downup') %>%

  filter(date>=end_date-4) %>% #take 5 latest years
  summarise(across(c('DT.TDS.DECT.EX.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c, DT.TDS.DECT.EX.ZS)

#bring in Sachs SDG index
#https://dashboards.sdgindex.org/explorer
# accessed on 2023-07-20

sdg_index <- read_excel(path=paste0(raw_dir, "/SDR2024-data.xlsx"),
                        sheet='Backdated SDG Index'
) %>%
  transmute(iso3c=`id`,
         date=year,
         sdg_index_score=`SDG Index Score`) %>%
  group_by(iso3c) %>%
  fill(c('sdg_index_score'), .direction = 'downup') %>%
  filter(date>=end_date-4) %>% #take 5 latest years
  summarise(across(c('sdg_index_score'), mean, na.rm=TRUE)) %>%
  select(iso3c, sdg_index_score)


#the UN’s HDI (https://hdr.undp.org/data-center/human-development-index#/indicies/HDI).  Downloaded Aug 24, 2023

hdi_df <- read_csv(paste0(raw_dir, 'HDR21-22_Composite_indices_complete_time_series.csv')) %>%
  rename(iso3c=iso3) %>%
  select(iso3c, hdi_1990:hdi_2021) %>%
  pivot_longer(
    cols=c('hdi_1990':'hdi_2021'),
    names_to='index',
    values_to='hdi_value'
  ) %>%
  mutate(
    date=as.numeric(str_sub(index,5,8))
  ) %>%
  group_by(iso3c) %>%
  fill(c('hdi_value'), .direction = 'downup') %>%
  filter(date>=2021-4) %>% #take 5 latest years
  summarise(across(c('hdi_value'), mean, na.rm=TRUE)) %>%
  select(iso3c, hdi_value)

#Yale’s Environmental Performance Index (https://epi.yale.edu/).  Downloaded Aug 24, 2023
epi_2024_df <- read_csv(paste0(raw_dir, 'epi2024results.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.new) %>%
  mutate(date=2024) %>%
  select(iso3c, date, env_perform_index)

epi_2022_df <- read_csv(paste0(raw_dir, 'epi2022results05302022.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.new) %>%
  mutate(date=2022) %>%
  select(iso3c, date, env_perform_index)

epi_2020_df <- read_csv(paste0(raw_dir, 'epi2020results.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.new) %>%
  mutate(date=2020) %>%
  select(iso3c, date, env_perform_index)

epi_2018_df <- read_csv(paste0(raw_dir, 'epi2018results.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.current) %>%
  mutate(date=2018) %>%
  select(iso3c, date, env_perform_index)

epi_df <- bind_rows(epi_2024_df,epi_2022_df, epi_2020_df, epi_2018_df) %>%
  group_by(iso3c) %>%
  fill(c('env_perform_index'), .direction = 'downup') %>%
  filter(date>=2024-4) %>% #take 5 latest years
  summarise(across(c('env_perform_index'), mean, na.rm=TRUE)) %>%
  select(iso3c, env_perform_index)

# FAO food security index and prevalence of undernourishment (https://www.fao.org/faostat/en/#data/FS)
fao_stats_df <- read_csv(paste0(raw_dir, "FAOSTAT_data_en_1-6-2025.csv")) %>%
    mutate(geoAreaCode=as.numeric(`Area Code (M49)`)) %>% #convert to numeric
    left_join(
        read_csv(paste0(raw_dir, "/metadata/iso_codes.csv") ) %>%
            select(geoAreaCode, iso3c)
    ) %>%
    filter(Item %in% c(
        "Prevalence of severe food insecurity in the total population (percent) (3-year average)",
        "Prevalence of undernourishment (percent) (3-year average)"
    )) %>%
    mutate(date=as.numeric(str_sub(`Year Code`,1,4))+1)  %>% #add 1 because it is the midpoint of 3 year avg
    select(iso3c, date, Item, Value) %>% #keep only relevant columns
    filter(!is.na(iso3c)) %>%
    #pivot wider
    pivot_wider(
        names_from=Item,
        values_from=Value,
    ) %>%
    #shorten names
    rename(
        undernourish=`Prevalence of undernourishment (percent) (3-year average)`,
        severe_food_insecurity=`Prevalence of severe food insecurity in the total population (percent) (3-year average)`
    ) %>%
    #keep only numeric values in Value using str_remove_all
    mutate(
        undernourish=as.numeric(str_remove_all(undernourish, "[^0-9.]")),
        severe_food_insecurity=as.numeric(str_remove_all(severe_food_insecurity, "[^0-9.]"))
    ) %>%
  group_by(iso3c) %>%
  fill(c('undernourish','severe_food_insecurity'), .direction = 'downup') %>%
  filter(date>=2023-4) %>% #take 5 latest years
  summarise(across(c('undernourish','severe_food_insecurity'), mean, na.rm=TRUE)) %>%
  select(iso3c, undernourish, severe_food_insecurity) 

##Harvard’s Economic complexity index (https://atlas.cid.harvard.edu/rankings).  Downloaded Aug 24, 2023
ec_complex_df <- read_dta(paste0(raw_dir, "Country Complexity Rankings 1995 - 2022.dta")) %>%
  rename(geoAreaCode=country_id,
         eci_value=hs_eci,
         date=year) %>%
  left_join(iso_codes) %>%
  group_by(iso3c) %>%
  filter(date>=2021-4) %>% #take 5 latest years
  summarise(across(c('eci_value'), mean, na.rm=TRUE)) %>%
  select(iso3c, eci_value)

#World press freedom index (https://rsf.org/en/index).  Downloaded Aug 24, 2023
press_free_df <- bind_rows(
  read_delim(paste0(raw_dir, "press_freedom_2023.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2022.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2021.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2020.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2019.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2018.csv"), delim=";")
) %>%
  mutate(Score=if_else(is.na(Score),`Score N`, Score )) %>%
  mutate(Score=case_when(
    Score<100 ~ Score,
    between(Score,100,999) ~ Score/10,
    TRUE ~ Score/100
  )) %>% #fix issue with comma decimal separator  
  transmute(
    iso3c=ISO,
    press_free_score=Score,
    date=`Year (N)`
  ) %>%
  group_by(iso3c) %>%
  fill(c('press_free_score'), .direction = 'downup') %>%
  filter(date>=2023-4) %>% #take 5 latest years
  summarise(across(c('press_free_score'), mean, na.rm=TRUE)) %>%
  select(iso3c, press_free_score)

#OECD’s better life index (https://www.oecdbetterlifeindex.org/#/11111111111).  Downloaded Aug 24, 2023.  Only one year of data as not comparable over time.
bli_df <- read_csv(paste0(raw_dir, 'BLI_20230824.csv')) %>%
  rename(iso3c=LOCATION) %>%
  mutate(
    category=case_when(
      Indicator %in% c("Housing expenditure", "Dwellings without basic facilities", "Rooms per person") ~ "Housing",
      Indicator %in% c("Household net wealth", "Household net adjusted disposable income") ~ "Income",
      Indicator %in% c('Labour market insecurity', 'Personal earnings', 'Long-term unemployment rate', 'Employment rate') ~ "Jobs",
      Indicator %in% c('Quality of support network') ~ "Community",
      Indicator %in% c('Years in education','Student skills','Educational attainment') ~ "Education",
      Indicator %in% c('Water quality', "Air pollution") ~ "Environment",
      Indicator %in% c('Stakeholder engagement for developing regulations', 'Voter turnout') ~ "Civic engagement",
      Indicator %in% c('Self-reported health', 'Life expectancy') ~ "Health",
      Indicator %in% c('Life satisfaction') ~ "Life satisfaction",
      Indicator %in% c('Homicide rate', 'Feeling safe walking alone at night') ~ "Safety",
      Indicator %in% c('Time devoted to leisure and personal care','Employees working very long hours') ~ "Work-Life Balance"
      
    )
  ) %>%
  group_by(Indicator) %>%
  mutate(
    score=case_when(#give 1 to a best perfomer and 0 to worst
      Indicator %in% c("Housing expenditure", "Dwellings without basic facilities",
                       'Labour market insecurity','Long-term unemployment rate',
                       "Air pollution", 'Homicide rate', 'Employees working very long hours') ~
        1-(Value-min(Value))/(max(Value)-min(Value)),
      TRUE ~ 
        (Value-min(Value))/(max(Value)-min(Value))
    )
  ) %>%
  #zscore all indicators
  mutate(zscore=(Value-mean(Value, na.rm=TRUE))/sd(Value, na.rm=TRUE)) %>%
  group_by(iso3c,category  ) %>%
  #combine indicators with equal weight
  summarise(better_life_subindex=mean(score, na.rm=TRUE)) %>%
  group_by(iso3c) %>%
  summarise(better_life_index=sum(better_life_subindex))

#add legatum prosperity index
# https://www.prosperity.com/
# downloaded on 2024-12-20
legatum_prosperity_df <- read_csv(paste0(raw_dir, 'legatum_prosperity_index.csv')) %>%
  janitor::clean_names() %>%
  transmute(
    iso3c=iso3c,
    date=end_date,
    legatum_health_index=health
  ) %>%
  select(iso3c, legatum_health_index)

# add Global Health Security Index (GHSI) overall score
ghsi_df <- read_csv(paste0(raw_dir, '2021-GHS-Index-April-2022.csv')) %>%
  janitor::clean_names() %>%

  transmute(
    iso3c=iso3c,
    date=year,
    ghsindex=overall_score
  ) %>%
  select(iso3c, date, ghsindex) %>%
  group_by(iso3c) %>%
  filter(date>=2021-4) %>% #take 5 latest years
  summarise(across(c('ghsindex'), mean, na.rm=TRUE)) %>%
  select(iso3c, ghsindex)


#bring it all together
correlates_df_avg <- correlates_df %>%
  left_join(ge_df) %>%
  left_join(gdp_df) %>%
  left_join(hci_df) %>%
  left_join(undernourish_df) %>%
  left_join(mmt_df) %>%
  left_join(lpov_df) %>%
  left_join(wbl_df) %>%
  left_join(safely_df) %>%
  left_join(elect_df) %>%
  left_join(manuf_df) %>%
  left_join(slums_df) %>%
  left_join(ff_sub_df) %>%
  left_join(ghg_df) %>%
  left_join(protected_df) %>%
  left_join(debt_df) %>%
  left_join(sdg_index) %>%
  left_join(hdi_df) %>%
  left_join(epi_df) %>%
  left_join(ec_complex_df) %>%
  left_join(press_free_df) %>%
  left_join(bli_df) %>%       left_join(fao_stats_df) %>%
  left_join(legatum_prosperity_df) %>% left_join(ghsi_df) %>%
  filter(region!="Aggregates") %>%
  select(iso3c, c('SI.POV.DDAY', 'SI.POV.GINI','GE.EST','NY.GDP.PCAP.KD','HD.HCI.OVRL','SN.ITK.DEFC.ZS','SH.STA.MMRT','SE.LPV.PRIM','SG.LAW.INDX','SH.H2O.SMDW.ZS',"EG.ELC.ACCS.ZS",'NV.IND.MANF.ZS','EN.POP.SLUM.UR.ZS','subsidies','EN.GHG.ALL.MT.CE.AR5','ER.MRN.PTMR.ZS','ER.LND.PTLD.ZS','DT.TDS.DECT.EX.ZS', 'sdg_index_score', 'hdi_value', 'env_perform_index', 'eci_value', 'press_free_score', 'better_life_index',                   'undernourish', 'severe_food_insecurity', 'legatum_health_index', 'ghsindex'))


```

```{r comparison5avg, include=FALSE}



#pull SCI values

sci_df <- read_excel(paste0(raw_dir, "Statistical_Capacity_Indicators.xlsx")) %>%
  mutate(YR2004=as.numeric(YR2004),
         iso3c=`Country Code`) %>%
  filter(`Series Code`=="IQ.SCI.OVRL") %>%
  select(-`Country Code`, -`Series Code`) %>%
  pivot_longer(
    cols=c(YR2004:YR2020),
    names_to='YR',
    values_to='SCI'
  ) %>%
  mutate(date=str_remove_all(YR,"YR"),
         date=as.numeric(date)) %>%
  select(iso3c, date, SCI)
  
  
  

# onvert it to a data frame.
sci_df <- sci_df %>%
  left_join(spi_df_empty) %>% #add on country metadata
  filter(!is.na(income)) %>%
  select(iso3c, date, SCI  ) %>%
  group_by(date) %>%
  arrange(-SCI) %>%
  mutate(SCI_rank=rank(-SCI),
         SCI_rank=if_else(is.na(SCI),as.numeric(NA),SCI_rank)) %>%
  group_by(iso3c) %>%
  filter(between(date, 2016,2020)) %>%
  summarise(across(starts_with("SCI"), mean, na.rm=TRUE))


#read in ODIN data
  for (i in c(2015,2016,2017,2018,2020,2022)) {
   temp <- read_csv(paste(raw_dir, '/','ODIN_',i,'.csv', sep=""))  %>%
        as_tibble(.name_repair = 'universal') %>%
        mutate(date=i) %>%
        filter(Data.categories=='All Categories') %>%
        select(-Year)


    assign(paste('odin_df',i,sep="_"), temp)
  }

    #bind different years together
odin_df <- bind_rows(odin_df_2015, odin_df_2016, odin_df_2017, odin_df_2018, odin_df_2020, odin_df_2022)


odin_df <- odin_df %>%
    select(Country.Code, date, Overall.score) %>%
    rename(iso3c=Country.Code,
           ODIN_score=Overall.score) %>%
    group_by(date) %>%
    arrange(-ODIN_score) %>%
    mutate(ODIN_rank=rank(-ODIN_score, na.last='NA')) %>% 
  right_join(spi_df_empty) %>%
  arrange(country, date) %>%
  group_by(country) %>%
  mutate(across(starts_with("ODIN"), na.locf, na.rm=FALSE)) %>%
  left_join(wbstats::wb_countries()) %>%
  group_by(iso3c) %>%
  filter(between(date, 2018,2022)) %>%
  summarise(across(starts_with("ODIN"), mean, na.rm=TRUE))

#read in the ODB
odb_df_2017 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2017-Rankings') %>%
  transmute(
    iso3c=ISO3,
    odb=`ODB-Score`
  )


odb_df_2016_info <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2016-Rankings') %>%
  select(ISO3, Country) 

odb_df_2016 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2016-Absolute-Recalculated') %>%
  left_join(odb_df_2016_info) %>%
  transmute(
    iso3c=ISO3,
    odb_2016=`ODB Score (0-100)`
  )

odb_df_2015 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2015-Rankings')
odb_df_2014 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2014-Rankings')
odb_df_2013 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2013-Rankings')

odb_df <- bind_rows(
  (odb_df_2017 %>% mutate(date=2017)),
  (odb_df_2016 %>% mutate(date=2016, odb=odb_2016)),
  (odb_df_2015 %>% mutate(date=2015, iso3c=ISO3, odb=`ODB-Score-Scaled`)),
  (odb_df_2014 %>% mutate(date=2014, iso3c=ISO3, odb=`ODB-Score-Scaled`)),
  (odb_df_2013 %>% mutate(date=2013, iso3c=ISO3, odb=`ODB-Score-Scaled`)),
  ) %>%
  select(iso3c, date, odb) %>%
  left_join(country_list) %>%
  group_by(iso3c) %>%
  summarise(across(starts_with("odb"), mean, na.rm=TRUE))


#read in the GDB
gdb_df <- read_csv(paste0(raw_dir, "/global-data-barometer-topic--all-countries.csv")) %>%
  transmute(iso3c=iso,
            gdb=`GDB 2021 score`)



#IIAG
iiag_df <- read_csv(paste0(raw_dir, "/2022 IIAG_Scores.csv")) %>%
  transmute(
    iso2c=Country_ISO,
    iiag_overall=`OVERALL GOVERNANCE`,
    iiag_stats=`Capacity of the Statistical System`,
    date=Year
  ) %>%
  left_join(country_list) %>%
  group_by(iso3c) %>%
  filter(between(date, 2018,2022)) %>%
  summarise(across(c('iiag_overall','iiag_stats'), mean, na.rm=TRUE)) %>%
  select(iso3c,iiag_overall,iiag_stats )
  


#create one database
comparison_avg_df <- spi_index_df %>%
  ungroup() %>%
  group_by(iso3c) %>%
  filter(between(date, 2018,2022)) %>%
  summarise(across(starts_with("SPI.INDEX"), mean, na.rm=TRUE)) %>%
  arrange(-SPI.INDEX) %>%
  mutate(across(starts_with('SPI.INDEX'),~1*.),
         across(starts_with('SPI.INDEX'),round,1)) %>%
  select(iso3c, SPI.INDEX) %>%
  left_join(sci_df) %>%
  left_join(odin_df) %>%
  left_join(odb_df) %>%
  left_join(gdb_df) %>%
  left_join(iiag_df) %>%
  left_join(correlates_df_avg) 

#save to output
write_excel_csv(comparison_avg_df, paste0(output_dir, "/SPI_Index_SDG_comparisons_data_5yr_avg.csv"))


```

### Now do a three year average.

```{r correlates3avg}

#instead of the latest value, take the 5 year average

start_date=2010

correlates_df <- wb_data(country="all", 
              indicator=c('SI.POV.DDAY', 'SI.POV.GINI'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(country) %>%
  right_join(spi_df_empty) %>%
  group_by(iso3c, country, region, income, lending) %>%
  fill(c('SI.POV.DDAY', 'SI.POV.GINI'), .direction = 'downup') %>%
  filter(date>=(end_date-2)) %>% #take 5 latest years
  summarise(across(c('SI.POV.DDAY', 'SI.POV.GINI'), mean, na.rm=TRUE))  %>%
  mutate(date=end_date)


ge_df <- wb_data(country="all", 
              indicator=c('GE.EST'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('GE.EST'), .direction = 'downup') %>%
  filter(date>=end_date-2) %>% #take 5 latest years
  summarise(across(c('GE.EST'), mean, na.rm=TRUE)) %>%
  select(iso3c, GE.EST)

gdp_df <- wb_data(country="all", 
              indicator=c('NY.GDP.PCAP.KD'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('NY.GDP.PCAP.KD'), .direction = 'downup') %>%
  filter(date>=end_date-2) %>% #take 5 latest years
  summarise(across(c('NY.GDP.PCAP.KD'), mean, na.rm=TRUE)) %>%
  select(iso3c, NY.GDP.PCAP.KD)

hci_df <- wb_data(country="all", 
              indicator=c('HD.HCI.OVRL'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('HD.HCI.OVRL'), .direction = 'downup') %>%
   filter(date>=end_date-2) %>% #take 5 latest years
  summarise(across(c('HD.HCI.OVRL'), mean, na.rm=TRUE))  %>%
  select(iso3c, HD.HCI.OVRL)

undernourish_df <- wb_data(country="all", 
              indicator=c('SN.ITK.DEFC.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('SN.ITK.DEFC.ZS'), .direction = 'downup') %>%
  filter(date>=end_date-2) %>% #take 5 latest years
  summarise(across(c('SN.ITK.DEFC.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c, SN.ITK.DEFC.ZS)

mmt_df <-  wb_data(country="all", 
              indicator=c('SH.STA.MMRT'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('SH.STA.MMRT'), .direction = 'downup') %>%
  filter(date>=end_date-2) %>% #take 5 latest years
  summarise(across(c('SH.STA.MMRT'), mean, na.rm=TRUE)) %>%
  select(iso3c,  SH.STA.MMRT)


lpov_df <-  wb_data(country="all", 
              indicator=c('SE.LPV.PRIM'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('SE.LPV.PRIM'), .direction = 'downup') %>%
  filter(date>=end_date-2) %>% #take 5 latest years
  summarise(across(c('SE.LPV.PRIM'), mean, na.rm=TRUE))  %>%
  select(iso3c, SE.LPV.PRIM)  


wbl_df <- wb_data(country="all", 
              indicator=c('SG.LAW.INDX'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('SG.LAW.INDX'), .direction = 'downup') %>%
  filter(date>=end_date-2) %>% #take 5 latest years
  summarise(across(c('SG.LAW.INDX'), mean, na.rm=TRUE)) %>%
  select(iso3c,  SG.LAW.INDX)

safely_df <- wb_data(country="all", 
              indicator=c('SH.H2O.SMDW.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('SH.H2O.SMDW.ZS'), .direction = 'downup') %>%
  filter(date>=end_date-2) %>% #take 5 latest years
  summarise(across(c('SH.H2O.SMDW.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c, SH.H2O.SMDW.ZS)
  
elect_df <- wb_data(country="all", 
              indicator=c('EG.ELC.ACCS.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('EG.ELC.ACCS.ZS'), .direction = 'downup') %>%
  filter(date>=end_date-2) %>% #take 5 latest years
  summarise(across(c('EG.ELC.ACCS.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c, EG.ELC.ACCS.ZS)
  
manuf_df <- wb_data(country="all", 
              indicator=c('NV.IND.MANF.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('NV.IND.MANF.ZS'), .direction = 'downup') %>%
  filter(date>=end_date-2) %>% #take 5 latest years
  summarise(across(c('NV.IND.MANF.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c, NV.IND.MANF.ZS)


slums_df <- wb_data(country="all", 
              indicator=c('EN.POP.SLUM.UR.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('EN.POP.SLUM.UR.ZS'), .direction = 'downup') %>%
    filter(date>=end_date-2) %>% #take 5 latest years
  summarise(across(c('EN.POP.SLUM.UR.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c,  EN.POP.SLUM.UR.ZS)

ff_sub_df <- read_csv(paste0(raw_dir, "/Subsidies.csv")) %>%
  rename(iso3c=code)
  

ghg_df <- wb_data(country="all", 
              indicator=c('EN.GHG.ALL.MT.CE.AR5'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('EN.GHG.ALL.MT.CE.AR5'), .direction = 'downup') %>%
  filter(date>=end_date-2) %>% #take 5 latest years
  summarise(across(c('EN.GHG.ALL.MT.CE.AR5'), mean, na.rm=TRUE)) %>%
  select(iso3c,  EN.GHG.ALL.MT.CE.AR5)

protected_df <- wb_data(country="all", 
              indicator=c('ER.MRN.PTMR.ZS','ER.LND.PTLD.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  fill(c('ER.MRN.PTMR.ZS','ER.LND.PTLD.ZS'), .direction = 'downup') %>%
  filter(date>=end_date-2) %>% #take 5 latest years
  summarise(across(c('ER.MRN.PTMR.ZS', 'ER.LND.PTLD.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c, ER.MRN.PTMR.ZS,ER.LND.PTLD.ZS)

debt_df <- wb_data(country="all", 
              indicator=c('DT.TDS.DECT.EX.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  fill(c('DT.TDS.DECT.EX.ZS'), .direction = 'downup') %>%

  filter(date>=end_date-2) %>% #take 5 latest years
  summarise(across(c('DT.TDS.DECT.EX.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c, DT.TDS.DECT.EX.ZS)

#bring in Sachs SDG index
#https://dashboards.sdgindex.org/explorer
# accessed on 2023-07-20

sdg_index <- read_excel(path=paste0(raw_dir, "/SDR2024-data.xlsx"),
                        sheet='Backdated SDG Index'
) %>%
  transmute(iso3c=id,
         date=year,
         sdg_index_score=`SDG Index Score`) %>%
  group_by(iso3c) %>%
  fill(c('sdg_index_score'), .direction = 'downup') %>%
  filter(date>=end_date-2) %>% #take 5 latest years
  summarise(across(c('sdg_index_score'), mean, na.rm=TRUE)) %>%
  select(iso3c, sdg_index_score)

#hdi
hdi_df <- read_csv(paste0(raw_dir, 'HDR21-22_Composite_indices_complete_time_series.csv')) %>%
  rename(iso3c=iso3) %>%
  select(iso3c, hdi_1990:hdi_2021) %>%
  pivot_longer(
    cols=c('hdi_1990':'hdi_2021'),
    names_to='index',
    values_to='hdi_value'
  ) %>%
  mutate(
    date=as.numeric(str_sub(index,5,8))
  ) %>%
  group_by(iso3c) %>%
  fill(c('hdi_value'), .direction = 'downup') %>%
  filter(date>=2021-2) %>% #take 5 latest years
  summarise(across(c('hdi_value'), mean, na.rm=TRUE)) %>%
  select(iso3c, hdi_value)

#Yale’s Environmental Performance Index (https://epi.yale.edu/).  Downloaded Aug 24, 2023
epi_2024_df <- read_csv(paste0(raw_dir, 'epi2024results.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.new) %>%
  mutate(date=2024) %>%
  select(iso3c, date, env_perform_index)

epi_2022_df <- read_csv(paste0(raw_dir, 'epi2022results05302022.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.new) %>%
  mutate(date=2022) %>%
  select(iso3c, date, env_perform_index)

epi_2020_df <- read_csv(paste0(raw_dir, 'epi2020results.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.new) %>%
  mutate(date=2020) %>%
  select(iso3c, date, env_perform_index)

epi_2018_df <- read_csv(paste0(raw_dir, 'epi2018results.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.current) %>%
  mutate(date=2018) %>%
  select(iso3c, date, env_perform_index)

epi_df <- bind_rows(epi_2024_df,epi_2022_df, epi_2020_df, epi_2018_df) %>%
  group_by(iso3c) %>%
  fill(c('env_perform_index'), .direction = 'downup') %>%
  filter(date>=2024-2) %>% #take 5 latest years
  summarise(across(c('env_perform_index'), mean, na.rm=TRUE)) %>%
  select(iso3c, env_perform_index)

# FAO food security index and prevalence of undernourishment (https://www.fao.org/faostat/en/#data/FS)
fao_stats_df <- read_csv(paste0(raw_dir, "FAOSTAT_data_en_1-6-2025.csv")) %>%
    mutate(geoAreaCode=as.numeric(`Area Code (M49)`)) %>% #convert to numeric
    left_join(
        read_csv(paste0(raw_dir, "/metadata/iso_codes.csv") ) %>%
            select(geoAreaCode, iso3c)
    ) %>%
    filter(Item %in% c(
        "Prevalence of severe food insecurity in the total population (percent) (3-year average)",
        "Prevalence of undernourishment (percent) (3-year average)"
    )) %>%
    mutate(date=as.numeric(str_sub(`Year Code`,1,4))+1)  %>% #add 1 because it is the midpoint of 3 year avg
    select(iso3c, date, Item, Value) %>% #keep only relevant columns
    filter(!is.na(iso3c)) %>%
    #pivot wider
    pivot_wider(
        names_from=Item,
        values_from=Value,
    ) %>%
    #shorten names
    rename(
        undernourish=`Prevalence of undernourishment (percent) (3-year average)`,
        severe_food_insecurity=`Prevalence of severe food insecurity in the total population (percent) (3-year average)`
    ) %>%
    #keep only numeric values in Value using str_remove_all
    mutate(
        undernourish=as.numeric(str_remove_all(undernourish, "[^0-9.]")),
        severe_food_insecurity=as.numeric(str_remove_all(severe_food_insecurity, "[^0-9.]"))
    ) %>%
  group_by(iso3c) %>%
  fill(c('undernourish','severe_food_insecurity'), .direction = 'downup') %>%
  filter(date>=2023-2) %>% #take 5 latest years
  summarise(across(c('undernourish','severe_food_insecurity'), mean, na.rm=TRUE)) %>%
  select(iso3c, undernourish, severe_food_insecurity) 

##Harvard’s Economic complexity index (https://atlas.cid.harvard.edu/rankings).  Downloaded Aug 24, 2023
ec_complex_df <- read_dta(paste0(raw_dir, "Country Complexity Rankings 1995 - 2022.dta")) %>%
  rename(geoAreaCode=country_id,
         eci_value=hs_eci,
         date=year) %>%
  left_join(iso_codes) %>%
  group_by(iso3c) %>%
  filter(date>=2021-2) %>% #take 5 latest years
  summarise(across(c('eci_value'), mean, na.rm=TRUE)) %>%
  select(iso3c, eci_value)

#World press freedom index (https://rsf.org/en/index).  Downloaded Aug 24, 2023
press_free_df <- bind_rows(
  read_delim(paste0(raw_dir, "press_freedom_2023.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2022.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2021.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2020.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2019.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2018.csv"), delim=";")
) %>%
  mutate(Score=if_else(is.na(Score),`Score N`, Score )) %>%
  mutate(Score=case_when(
    Score<100 ~ Score,
    between(Score,100,999) ~ Score/10,
    TRUE ~ Score/100
  )) %>% #fix issue with comma decimal separator  
  transmute(
    iso3c=ISO,
    press_free_score=Score,
    date=`Year (N)`
  ) %>%
  group_by(iso3c) %>%
  fill(c('press_free_score'), .direction = 'downup') %>%
  filter(date>=2023-2) %>% #take 5 latest years
  summarise(across(c('press_free_score'), mean, na.rm=TRUE)) %>%
  select(iso3c, press_free_score)

#OECD’s better life index (https://www.oecdbetterlifeindex.org/#/11111111111).  Downloaded Aug 24, 2023.  Only one year of data as not comparable over time.
#OECD’s better life index (https://www.oecdbetterlifeindex.org/#/11111111111).  Downloaded Aug 24, 2023.  Only one year of data as not comparable over time.
bli_df <- read_csv(paste0(raw_dir, 'BLI_20230824.csv')) %>%
  rename(iso3c=LOCATION) %>%
  mutate(
    category=case_when(
      Indicator %in% c("Housing expenditure", "Dwellings without basic facilities", "Rooms per person") ~ "Housing",
      Indicator %in% c("Household net wealth", "Household net adjusted disposable income") ~ "Income",
      Indicator %in% c('Labour market insecurity', 'Personal earnings', 'Long-term unemployment rate', 'Employment rate') ~ "Jobs",
      Indicator %in% c('Quality of support network') ~ "Community",
      Indicator %in% c('Years in education','Student skills','Educational attainment') ~ "Education",
      Indicator %in% c('Water quality', "Air pollution") ~ "Environment",
      Indicator %in% c('Stakeholder engagement for developing regulations', 'Voter turnout') ~ "Civic engagement",
      Indicator %in% c('Self-reported health', 'Life expectancy') ~ "Health",
      Indicator %in% c('Life satisfaction') ~ "Life satisfaction",
      Indicator %in% c('Homicide rate', 'Feeling safe walking alone at night') ~ "Safety",
      Indicator %in% c('Time devoted to leisure and personal care','Employees working very long hours') ~ "Work-Life Balance"
      
    )
  ) %>%
  group_by(Indicator) %>%
  mutate(
    score=case_when(#give 1 to a best perfomer and 0 to worst
      Indicator %in% c("Housing expenditure", "Dwellings without basic facilities",
                       'Labour market insecurity','Long-term unemployment rate',
                       "Air pollution", 'Homicide rate', 'Employees working very long hours') ~
        1-(Value-min(Value))/(max(Value)-min(Value)),
      TRUE ~ 
        (Value-min(Value))/(max(Value)-min(Value))
    )
  ) %>%
  #zscore all indicators
  mutate(zscore=(Value-mean(Value, na.rm=TRUE))/sd(Value, na.rm=TRUE)) %>%
  group_by(iso3c,category  ) %>%
  #combine indicators with equal weight
  summarise(better_life_subindex=mean(score, na.rm=TRUE)) %>%
  group_by(iso3c) %>%
  summarise(better_life_index=sum(better_life_subindex))

#add legatum prosperity index
# https://www.prosperity.com/
# downloaded on 2024-12-20
legatum_prosperity_df <- read_csv(paste0(raw_dir, 'legatum_prosperity_index.csv')) %>%
  janitor::clean_names() %>%
  transmute(
    iso3c=iso3c,
    date=end_date,
    legatum_health_index=health
  ) %>%
  select(iso3c, legatum_health_index)

# add Global Health Security Index (GHSI) overall score
ghsi_df <- read_csv(paste0(raw_dir, '2021-GHS-Index-April-2022.csv')) %>%
  janitor::clean_names() %>%

  transmute(
    iso3c=iso3c,
    date=year,
    ghsindex=overall_score
  ) %>%
  select(iso3c, date, ghsindex) %>%
  group_by(iso3c) %>%
  filter(date>=2021-2) %>% #take 2 latest years
  summarise(across(c('ghsindex'), mean, na.rm=TRUE)) %>%
  select(iso3c, ghsindex)

#bring it all together
correlates_df_avg <- correlates_df %>%
  left_join(ge_df) %>%
  left_join(gdp_df) %>%
  left_join(hci_df) %>%
  left_join(undernourish_df) %>%
  left_join(mmt_df) %>%
  left_join(lpov_df) %>%
  left_join(wbl_df) %>%
  left_join(safely_df) %>%
  left_join(elect_df) %>%
  left_join(manuf_df) %>%
  left_join(slums_df) %>%
  left_join(ff_sub_df) %>%
  left_join(ghg_df) %>%
  left_join(protected_df) %>%
  left_join(debt_df) %>%
  left_join(sdg_index) %>%
  left_join(hdi_df) %>%
  left_join(epi_df) %>%
  left_join(ec_complex_df) %>%
  left_join(press_free_df) %>%
  left_join(bli_df) %>%       left_join(fao_stats_df) %>%
  left_join(legatum_prosperity_df) %>% left_join(ghsi_df) %>%
  filter(region!="Aggregates") %>%
  select(iso3c, c('SI.POV.DDAY', 'SI.POV.GINI','GE.EST','NY.GDP.PCAP.KD','HD.HCI.OVRL','SN.ITK.DEFC.ZS','SH.STA.MMRT','SE.LPV.PRIM','SG.LAW.INDX','SH.H2O.SMDW.ZS',"EG.ELC.ACCS.ZS",'NV.IND.MANF.ZS','EN.POP.SLUM.UR.ZS','subsidies','EN.GHG.ALL.MT.CE.AR5','ER.MRN.PTMR.ZS','ER.LND.PTLD.ZS','DT.TDS.DECT.EX.ZS', 'sdg_index_score', 'hdi_value', 'env_perform_index', 'eci_value', 'press_free_score', 'better_life_index',                   'undernourish', 'severe_food_insecurity', 'legatum_health_index', 'ghsindex'))


```

```{r comparison3avg, include=FALSE}



#pull SCI values

sci_df <- read_excel(paste0(raw_dir, "Statistical_Capacity_Indicators.xlsx")) %>%
  mutate(YR2004=as.numeric(YR2004),
         iso3c=`Country Code`) %>%
  filter(`Series Code`=="IQ.SCI.OVRL") %>%
  select(-`Country Code`, -`Series Code`) %>%
  pivot_longer(
    cols=c(YR2004:YR2020),
    names_to='YR',
    values_to='SCI'
  ) %>%
  mutate(date=str_remove_all(YR,"YR"),
         date=as.numeric(date)) %>%
  select(iso3c, date, SCI)
  
  
  

# onvert it to a data frame.
sci_df <- sci_df %>%
  left_join(spi_df_empty) %>% #add on country metadata
  filter(!is.na(income)) %>%
  select(iso3c, date, SCI  ) %>%
  group_by(date) %>%
  arrange(-SCI) %>%
  mutate(SCI_rank=rank(-SCI),
         SCI_rank=if_else(is.na(SCI),as.numeric(NA),SCI_rank)) %>%
  group_by(iso3c) %>%
  filter(between(date, 2018,2020)) %>%
  summarise(across(starts_with("SCI"), mean, na.rm=TRUE))


#read in ODIN data
  for (i in c(2015,2016,2017,2018,2020,2022)) {
   temp <- read_csv(paste(raw_dir, '/','ODIN_',i,'.csv', sep=""))  %>%
        as_tibble(.name_repair = 'universal') %>%
        mutate(date=i) %>%
        filter(Data.categories=='All Categories') %>%
        select(-Year)


    assign(paste('odin_df',i,sep="_"), temp)
  }

    #bind different years together
odin_df <- bind_rows(odin_df_2015, odin_df_2016, odin_df_2017, odin_df_2018, odin_df_2020, odin_df_2022)


odin_df <- odin_df %>%
    select(Country.Code, date, Overall.score) %>%
    rename(iso3c=Country.Code,
           ODIN_score=Overall.score) %>%
    group_by(date) %>%
    arrange(-ODIN_score) %>%
    mutate(ODIN_rank=rank(-ODIN_score, na.last='NA')) %>% 
  right_join(spi_df_empty) %>%
  arrange(country, date) %>%
  group_by(country) %>%
  mutate(across(starts_with("ODIN"), na.locf, na.rm=FALSE)) %>%
  left_join(wbstats::wb_countries()) %>%
  group_by(iso3c) %>%
  filter(between(date, 2020,2022)) %>%
  summarise(across(starts_with("ODIN"), mean, na.rm=TRUE))

#read in the ODB
odb_df_2017 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2017-Rankings') %>%
  transmute(
    iso3c=ISO3,
    odb=`ODB-Score`
  )


odb_df_2016_info <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2016-Rankings') %>%
  select(ISO3, Country) 

odb_df_2016 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2016-Absolute-Recalculated') %>%
  left_join(odb_df_2016_info) %>%
  transmute(
    iso3c=ISO3,
    odb_2016=`ODB Score (0-100)`
  )

odb_df_2015 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2015-Rankings')
odb_df_2014 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2014-Rankings')
odb_df_2013 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2013-Rankings')

odb_df <- bind_rows(
  (odb_df_2017 %>% mutate(date=2017)),
  (odb_df_2016 %>% mutate(date=2016, odb=odb_2016)),
  (odb_df_2015 %>% mutate(date=2015, iso3c=ISO3, odb=`ODB-Score-Scaled`))
  ) %>%
  select(iso3c, date, odb) %>%
  left_join(country_list) %>%
  group_by(iso3c) %>%
  summarise(across(starts_with("odb"), mean, na.rm=TRUE))


#read in the GDB
gdb_df <- read_csv(paste0(raw_dir, "/global-data-barometer-topic--all-countries.csv")) %>%
  transmute(iso3c=iso,
            gdb=`GDB 2021 score`)



#IIAG
iiag_df <- read_csv(paste0(raw_dir, "/2022 IIAG_Scores.csv")) %>%
  transmute(
    iso2c=Country_ISO,
    iiag_overall=`OVERALL GOVERNANCE`,
    iiag_stats=`Capacity of the Statistical System`,
    date=Year
  ) %>%
  left_join(country_list) %>%
  group_by(iso3c) %>%
  filter(between(date, 2020,2022)) %>%
  summarise(across(c('iiag_overall','iiag_stats'), mean, na.rm=TRUE)) %>%
  select(iso3c,iiag_overall,iiag_stats )
  


#create one database
comparison_avg_df <- spi_index_df %>%
  ungroup() %>%
  group_by(iso3c) %>%
  filter(between(date, 2020,2022)) %>%
  summarise(across(starts_with("SPI.INDEX"), mean, na.rm=TRUE)) %>%
  arrange(-SPI.INDEX) %>%
  mutate(across(starts_with('SPI.INDEX'),~1*.),
         across(starts_with('SPI.INDEX'),round,1)) %>%
  select(iso3c, SPI.INDEX) %>%
  left_join(sci_df) %>%
  left_join(odin_df) %>%
  left_join(odb_df) %>%
  left_join(gdb_df) %>%
  left_join(iiag_df) %>%
  left_join(correlates_df_avg) 

#save to output
write_excel_csv(comparison_avg_df, paste0(output_dir, "/SPI_Index_SDG_comparisons_data_3yr_avg.csv"))


```

### Two year average

```{r correlates2avg}

#instead of the latest value, take the 5 year average

start_date=2010

correlates_df <- wb_data(country="all", 
              indicator=c('SI.POV.DDAY', 'SI.POV.GINI'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(country) %>%
  right_join(spi_df_empty) %>%
  group_by(iso3c, country, region, income, lending) %>%
  fill(c('SI.POV.DDAY', 'SI.POV.GINI'), .direction = 'downup') %>%
  filter(date>=(end_date-1)) %>% #take 5 latest years
  summarise(across(c('SI.POV.DDAY', 'SI.POV.GINI'), mean, na.rm=TRUE))  %>%
  mutate(date=end_date)


ge_df <- wb_data(country="all", 
              indicator=c('GE.EST'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('GE.EST'), .direction = 'downup') %>%
  filter(date>=end_date-1) %>% #take 5 latest years
  summarise(across(c('GE.EST'), mean, na.rm=TRUE)) %>%
  select(iso3c, GE.EST)

gdp_df <- wb_data(country="all", 
              indicator=c('NY.GDP.PCAP.KD'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('NY.GDP.PCAP.KD'), .direction = 'downup') %>%
  filter(date>=end_date-1) %>% #take 5 latest years
  summarise(across(c('NY.GDP.PCAP.KD'), mean, na.rm=TRUE)) %>%
  select(iso3c, NY.GDP.PCAP.KD)

hci_df <- wb_data(country="all", 
              indicator=c('HD.HCI.OVRL'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('HD.HCI.OVRL'), .direction = 'downup') %>%
   filter(date>=end_date-1) %>% #take 5 latest years
  summarise(across(c('HD.HCI.OVRL'), mean, na.rm=TRUE))  %>%
  select(iso3c, HD.HCI.OVRL)

undernourish_df <- wb_data(country="all", 
              indicator=c('SN.ITK.DEFC.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('SN.ITK.DEFC.ZS'), .direction = 'downup') %>%
  filter(date>=end_date-1) %>% #take 5 latest years
  summarise(across(c('SN.ITK.DEFC.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c, SN.ITK.DEFC.ZS)

mmt_df <-  wb_data(country="all", 
              indicator=c('SH.STA.MMRT'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('SH.STA.MMRT'), .direction = 'downup') %>%
  filter(date>=end_date-1) %>% #take 5 latest years
  summarise(across(c('SH.STA.MMRT'), mean, na.rm=TRUE)) %>%
  select(iso3c,  SH.STA.MMRT)


lpov_df <-  wb_data(country="all", 
              indicator=c('SE.LPV.PRIM'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('SE.LPV.PRIM'), .direction = 'downup') %>%
  filter(date>=end_date-1) %>% #take 5 latest years
  summarise(across(c('SE.LPV.PRIM'), mean, na.rm=TRUE))  %>%
  select(iso3c, SE.LPV.PRIM)  


wbl_df <- wb_data(country="all", 
              indicator=c('SG.LAW.INDX'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('SG.LAW.INDX'), .direction = 'downup') %>%
  filter(date>=end_date-1) %>% #take 5 latest years
  summarise(across(c('SG.LAW.INDX'), mean, na.rm=TRUE)) %>%
  select(iso3c,  SG.LAW.INDX)

safely_df <- wb_data(country="all", 
              indicator=c('SH.H2O.SMDW.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('SH.H2O.SMDW.ZS'), .direction = 'downup') %>%
  filter(date>=end_date-1) %>% #take 5 latest years
  summarise(across(c('SH.H2O.SMDW.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c, SH.H2O.SMDW.ZS)
  
elect_df <- wb_data(country="all", 
              indicator=c('EG.ELC.ACCS.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('EG.ELC.ACCS.ZS'), .direction = 'downup') %>%
  filter(date>=end_date-1) %>% #take 5 latest years
  summarise(across(c('EG.ELC.ACCS.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c, EG.ELC.ACCS.ZS)
  
manuf_df <- wb_data(country="all", 
              indicator=c('NV.IND.MANF.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('NV.IND.MANF.ZS'), .direction = 'downup') %>%
  filter(date>=end_date-1) %>% #take 5 latest years
  summarise(across(c('NV.IND.MANF.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c, NV.IND.MANF.ZS)


slums_df <- wb_data(country="all", 
              indicator=c('EN.POP.SLUM.UR.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('EN.POP.SLUM.UR.ZS'), .direction = 'downup') %>%
    filter(date>=end_date-1) %>% #take 5 latest years
  summarise(across(c('EN.POP.SLUM.UR.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c,  EN.POP.SLUM.UR.ZS)

ff_sub_df <- read_csv(paste0(raw_dir, "/Subsidies.csv")) %>%
  rename(iso3c=code)
  

ghg_df <- wb_data(country="all", 
              indicator=c('EN.GHG.ALL.MT.CE.AR5'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  right_join(spi_df_empty) %>%
  fill(c('EN.GHG.ALL.MT.CE.AR5'), .direction = 'downup') %>%
  filter(date>=end_date-1) %>% #take 5 latest years
  summarise(across(c('EN.GHG.ALL.MT.CE.AR5'), mean, na.rm=TRUE)) %>%
  select(iso3c,  EN.GHG.ALL.MT.CE.AR5)

protected_df <- wb_data(country="all", 
              indicator=c('ER.MRN.PTMR.ZS','ER.LND.PTLD.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  fill(c('ER.MRN.PTMR.ZS','ER.LND.PTLD.ZS'), .direction = 'downup') %>%
  filter(date>=end_date-1) %>% #take 5 latest years
  summarise(across(c('ER.MRN.PTMR.ZS', 'ER.LND.PTLD.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c, ER.MRN.PTMR.ZS,ER.LND.PTLD.ZS)

debt_df <- wb_data(country="all", 
              indicator=c('DT.TDS.DECT.EX.ZS'),
              start_date=start_date,
              end_date=end_date,
              ) %>% # fill forward 
  group_by(iso3c) %>%
  fill(c('DT.TDS.DECT.EX.ZS'), .direction = 'downup') %>%

  filter(date>=end_date-1) %>% #take 5 latest years
  summarise(across(c('DT.TDS.DECT.EX.ZS'), mean, na.rm=TRUE)) %>%
  select(iso3c, DT.TDS.DECT.EX.ZS)

#bring in Sachs SDG index
#https://dashboards.sdgindex.org/explorer
# accessed on 2023-07-20

sdg_index <- read_excel(path=paste0(raw_dir, "/SDR2024-data.xlsx"),
                        sheet='Backdated SDG Index'
) %>%
  transmute(iso3c=id,
         date=year,
         sdg_index_score=`SDG Index Score`) %>%
  group_by(iso3c) %>%
  filter(date>=end_date-1) %>% #take 5 latest years
  summarise(across(c('sdg_index_score'), mean, na.rm=TRUE)) %>%
  select(iso3c, sdg_index_score)


#hdi
hdi_df <- read_csv(paste0(raw_dir, 'HDR21-22_Composite_indices_complete_time_series.csv')) %>%
  rename(iso3c=iso3) %>%
  select(iso3c, hdi_1990:hdi_2021) %>%
  pivot_longer(
    cols=c('hdi_1990':'hdi_2021'),
    names_to='index',
    values_to='hdi_value'
  ) %>%
  mutate(
    date=as.numeric(str_sub(index,5,8))
  ) %>%
  group_by(iso3c) %>%
  fill(c('hdi_value'), .direction = 'downup') %>%
  filter(date>=2021-1) %>% #take 5 latest years
  summarise(across(c('hdi_value'), mean, na.rm=TRUE)) %>%
  select(iso3c, hdi_value)

#Yale’s Environmental Performance Index (https://epi.yale.edu/).  Downloaded Aug 24, 2023
epi_2024_df <- read_csv(paste0(raw_dir, 'epi2024results.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.new) %>%
  mutate(date=2024) %>%
  select(iso3c, date, env_perform_index)

epi_2022_df <- read_csv(paste0(raw_dir, 'epi2022results05302022.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.new) %>%
  mutate(date=2022) %>%
  select(iso3c, date, env_perform_index)

epi_2020_df <- read_csv(paste0(raw_dir, 'epi2020results.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.new) %>%
  mutate(date=2020) %>%
  select(iso3c, date, env_perform_index)

epi_2018_df <- read_csv(paste0(raw_dir, 'epi2018results.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.current) %>%
  mutate(date=2018) %>%
  select(iso3c, date, env_perform_index)

epi_df <- bind_rows(epi_2024_df,epi_2022_df, epi_2020_df, epi_2018_df) %>%
  group_by(iso3c) %>%
  fill(c('env_perform_index'), .direction = 'downup') %>%
  filter(date>=2024-1) %>% #take 5 latest years
  summarise(across(c('env_perform_index'), mean, na.rm=TRUE)) %>%
  select(iso3c, env_perform_index)

# FAO food security index and prevalence of undernourishment (https://www.fao.org/faostat/en/#data/FS)
fao_stats_df <- read_csv(paste0(raw_dir, "FAOSTAT_data_en_1-6-2025.csv")) %>%
    mutate(geoAreaCode=as.numeric(`Area Code (M49)`)) %>% #convert to numeric
    left_join(
        read_csv(paste0(raw_dir, "/metadata/iso_codes.csv") ) %>%
            select(geoAreaCode, iso3c)
    ) %>%
    filter(Item %in% c(
        "Prevalence of severe food insecurity in the total population (percent) (3-year average)",
        "Prevalence of undernourishment (percent) (3-year average)"
    )) %>%
    mutate(date=as.numeric(str_sub(`Year Code`,1,4))+1)  %>% #add 1 because it is the midpoint of 3 year avg
    select(iso3c, date, Item, Value) %>% #keep only relevant columns
    filter(!is.na(iso3c)) %>%
    #pivot wider
    pivot_wider(
        names_from=Item,
        values_from=Value,
    ) %>%
    #shorten names
    rename(
        undernourish=`Prevalence of undernourishment (percent) (3-year average)`,
        severe_food_insecurity=`Prevalence of severe food insecurity in the total population (percent) (3-year average)`
    ) %>%
    #keep only numeric values in Value using str_remove_all
    mutate(
        undernourish=as.numeric(str_remove_all(undernourish, "[^0-9.]")),
        severe_food_insecurity=as.numeric(str_remove_all(severe_food_insecurity, "[^0-9.]"))
    ) %>%
  group_by(iso3c) %>%
  fill(c('undernourish','severe_food_insecurity'), .direction = 'downup') %>%
  filter(date>=2022-1) %>% #take 5 latest years
  summarise(across(c('undernourish','severe_food_insecurity'), mean, na.rm=TRUE)) %>%
  select(iso3c, undernourish, severe_food_insecurity) 

##Harvard’s Economic complexity index (https://atlas.cid.harvard.edu/rankings).  Downloaded Aug 24, 2023
ec_complex_df <- read_dta(paste0(raw_dir, "Country Complexity Rankings 1995 - 2022.dta")) %>%
  rename(geoAreaCode=country_id,
         eci_value=hs_eci,
         date=year) %>%
  left_join(iso_codes) %>%
  group_by(iso3c) %>%
  filter(date>=2021-1) %>% #take 5 latest years
  summarise(across(c('eci_value'), mean, na.rm=TRUE)) %>%
  select(iso3c, eci_value)

#World press freedom index (https://rsf.org/en/index).  Downloaded Aug 24, 2023
press_free_df <- bind_rows(
  read_delim(paste0(raw_dir, "press_freedom_2023.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2022.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2021.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2020.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2019.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2018.csv"), delim=";")
) %>%
  mutate(Score=if_else(is.na(Score),`Score N`, Score )) %>%
  mutate(Score=case_when(
    Score<100 ~ Score,
    between(Score,100,999) ~ Score/10,
    TRUE ~ Score/100
  )) %>% #fix issue with comma decimal separator  
  transmute(
    iso3c=ISO,
    press_free_score=Score,
    date=`Year (N)`
  ) %>%
  group_by(iso3c) %>%
  fill(c('press_free_score'), .direction = 'downup') %>%
  filter(date>=2023-1) %>% #take 5 latest years
  summarise(across(c('press_free_score'), mean, na.rm=TRUE)) %>%
  select(iso3c, press_free_score)

#OECD’s better life index (https://www.oecdbetterlifeindex.org/#/11111111111).  Downloaded Aug 24, 2023.  Only one year of data as not comparable over time.
#OECD’s better life index (https://www.oecdbetterlifeindex.org/#/11111111111).  Downloaded Aug 24, 2023.  Only one year of data as not comparable over time.
bli_df <- read_csv(paste0(raw_dir, 'BLI_20230824.csv')) %>%
  rename(iso3c=LOCATION) %>%
  mutate(
    category=case_when(
      Indicator %in% c("Housing expenditure", "Dwellings without basic facilities", "Rooms per person") ~ "Housing",
      Indicator %in% c("Household net wealth", "Household net adjusted disposable income") ~ "Income",
      Indicator %in% c('Labour market insecurity', 'Personal earnings', 'Long-term unemployment rate', 'Employment rate') ~ "Jobs",
      Indicator %in% c('Quality of support network') ~ "Community",
      Indicator %in% c('Years in education','Student skills','Educational attainment') ~ "Education",
      Indicator %in% c('Water quality', "Air pollution") ~ "Environment",
      Indicator %in% c('Stakeholder engagement for developing regulations', 'Voter turnout') ~ "Civic engagement",
      Indicator %in% c('Self-reported health', 'Life expectancy') ~ "Health",
      Indicator %in% c('Life satisfaction') ~ "Life satisfaction",
      Indicator %in% c('Homicide rate', 'Feeling safe walking alone at night') ~ "Safety",
      Indicator %in% c('Time devoted to leisure and personal care','Employees working very long hours') ~ "Work-Life Balance"
      
    )
  ) %>%
  group_by(Indicator) %>%
  mutate(
    score=case_when(#give 1 to a best perfomer and 0 to worst
      Indicator %in% c("Housing expenditure", "Dwellings without basic facilities",
                       'Labour market insecurity','Long-term unemployment rate',
                       "Air pollution", 'Homicide rate', 'Employees working very long hours') ~
        1-(Value-min(Value))/(max(Value)-min(Value)),
      TRUE ~ 
        (Value-min(Value))/(max(Value)-min(Value))
    )
  ) %>%
  #zscore all indicators
  mutate(zscore=(Value-mean(Value, na.rm=TRUE))/sd(Value, na.rm=TRUE)) %>%
  group_by(iso3c,category  ) %>%
  #combine indicators with equal weight
  summarise(better_life_subindex=mean(score, na.rm=TRUE)) %>%
  group_by(iso3c) %>%
  summarise(better_life_index=sum(better_life_subindex))

#add legatum prosperity index
# https://www.prosperity.com/
# downloaded on 2024-12-20
legatum_prosperity_df <- read_csv(paste0(raw_dir, 'legatum_prosperity_index.csv')) %>%
  janitor::clean_names() %>%
  transmute(
    iso3c=iso3c,
    date=end_date,
    legatum_health_index=health
  ) %>%
  select(iso3c, legatum_health_index)

# add Global Health Security Index (GHSI) overall score
ghsi_df <- read_csv(paste0(raw_dir, '2021-GHS-Index-April-2022.csv')) %>%
  janitor::clean_names() %>%

  transmute(
    iso3c=iso3c,
    date=year,
    ghsindex=overall_score
  ) %>%
  select(iso3c, date, ghsindex) %>%
  group_by(iso3c) %>%
  filter(date>=2021-1) %>% #take 1 latest years
  summarise(across(c('ghsindex'), mean, na.rm=TRUE)) %>%
  select(iso3c, ghsindex)

#bring it all together
correlates_df_avg <- correlates_df %>%
  left_join(ge_df) %>%
  left_join(gdp_df) %>%
  left_join(hci_df) %>%
  left_join(undernourish_df) %>%
  left_join(mmt_df) %>%
  left_join(lpov_df) %>%
  left_join(wbl_df) %>%
  left_join(safely_df) %>%
  left_join(elect_df) %>%
  left_join(manuf_df) %>%
  left_join(slums_df) %>%
  left_join(ff_sub_df) %>%
  left_join(ghg_df) %>%
  left_join(protected_df) %>%
  left_join(debt_df) %>%
  left_join(sdg_index) %>%
  left_join(hdi_df) %>%
  left_join(epi_df) %>%
  left_join(ec_complex_df) %>%
  left_join(press_free_df) %>%
  left_join(bli_df) %>%       left_join(fao_stats_df) %>%
  left_join(legatum_prosperity_df) %>% left_join(ghsi_df) %>%
  filter(region!="Aggregates") %>%
  select(iso3c, c('SI.POV.DDAY', 'SI.POV.GINI','GE.EST','NY.GDP.PCAP.KD','HD.HCI.OVRL','SN.ITK.DEFC.ZS','SH.STA.MMRT','SE.LPV.PRIM','SG.LAW.INDX','SH.H2O.SMDW.ZS',"EG.ELC.ACCS.ZS",'NV.IND.MANF.ZS','EN.POP.SLUM.UR.ZS','subsidies','EN.GHG.ALL.MT.CE.AR5','ER.MRN.PTMR.ZS','ER.LND.PTLD.ZS','DT.TDS.DECT.EX.ZS', 'sdg_index_score', 'hdi_value', 'env_perform_index', 'eci_value', 'press_free_score', 'better_life_index',                   'undernourish', 'severe_food_insecurity', 'legatum_health_index', 'ghsindex'))


```

```{r comparison2avg, include=FALSE}



#pull SCI values

sci_df <- read_excel(paste0(raw_dir, "Statistical_Capacity_Indicators.xlsx")) %>%
  mutate(YR2004=as.numeric(YR2004),
         iso3c=`Country Code`) %>%
  filter(`Series Code`=="IQ.SCI.OVRL") %>%
  select(-`Country Code`, -`Series Code`) %>%
  pivot_longer(
    cols=c(YR2004:YR2020),
    names_to='YR',
    values_to='SCI'
  ) %>%
  mutate(date=str_remove_all(YR,"YR"),
         date=as.numeric(date)) %>%
  select(iso3c, date, SCI)
  
  
  

# onvert it to a data frame.
sci_df <- sci_df %>%
  left_join(spi_df_empty) %>% #add on country metadata
  filter(!is.na(income)) %>%
  select(iso3c, date, SCI  ) %>%
  group_by(date) %>%
  arrange(-SCI) %>%
  mutate(SCI_rank=rank(-SCI),
         SCI_rank=if_else(is.na(SCI),as.numeric(NA),SCI_rank)) %>%
  group_by(iso3c) %>%
  filter(between(date, 2019,2020)) %>%
  summarise(across(starts_with("SCI"), mean, na.rm=TRUE))


#read in ODIN data
  for (i in c(2015,2016,2017,2018,2020,2022)) {
   temp <- read_csv(paste(raw_dir, '/','ODIN_',i,'.csv', sep=""))  %>%
        as_tibble(.name_repair = 'universal') %>%
        mutate(date=i) %>%
        filter(Data.categories=='All Categories') %>%
        select(-Year)


    assign(paste('odin_df',i,sep="_"), temp)
  }

    #bind different years together
odin_df <- bind_rows(odin_df_2015, odin_df_2016, odin_df_2017, odin_df_2018, odin_df_2020, odin_df_2022)


odin_df <- odin_df %>%
    select(Country.Code, date, Overall.score) %>%
    rename(iso3c=Country.Code,
           ODIN_score=Overall.score) %>%
    group_by(date) %>%
    arrange(-ODIN_score) %>%
    mutate(ODIN_rank=rank(-ODIN_score, na.last='NA')) %>% 
  right_join(spi_df_empty) %>%
  arrange(country, date) %>%
  group_by(country) %>%
  mutate(across(starts_with("ODIN"), na.locf, na.rm=FALSE)) %>%
  left_join(wbstats::wb_countries()) %>%
  group_by(iso3c) %>%
  filter(between(date, 2021,2022)) %>%
  summarise(across(starts_with("ODIN"), mean, na.rm=TRUE))

#read in the ODB
odb_df_2017 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2017-Rankings') %>%
  transmute(
    iso3c=ISO3,
    odb=`ODB-Score`
  )


odb_df_2016_info <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2016-Rankings') %>%
  select(ISO3, Country) 

odb_df_2016 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2016-Absolute-Recalculated') %>%
  left_join(odb_df_2016_info) %>%
  transmute(
    iso3c=ISO3,
    odb_2016=`ODB Score (0-100)`
  )

odb_df_2015 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2015-Rankings')
odb_df_2014 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2014-Rankings')
odb_df_2013 <- read_excel(path=paste0(raw_dir, '/Open Data Barometer - Historical Data (All five Editions) - Public.xlsx'),
                     sheet='ODB-2013-Rankings')

odb_df <- bind_rows(
  (odb_df_2017 %>% mutate(date=2017)),
  (odb_df_2016 %>% mutate(date=2016, odb=odb_2016)),
  ) %>%
  select(iso3c, date, odb) %>%
  left_join(country_list) %>%
  group_by(iso3c) %>%
  summarise(across(starts_with("odb"), mean, na.rm=TRUE))


#read in the GDB
gdb_df <- read_csv(paste0(raw_dir, "/global-data-barometer-topic--all-countries.csv")) %>%
  transmute(iso3c=iso,
            gdb=`GDB 2021 score`)



#IIAG
iiag_df <- read_csv(paste0(raw_dir, "/2022 IIAG_Scores.csv")) %>%
  transmute(
    iso2c=Country_ISO,
    iiag_overall=`OVERALL GOVERNANCE`,
    iiag_stats=`Capacity of the Statistical System`,
    date=Year
  ) %>%
  left_join(country_list) %>%
  group_by(iso3c) %>%
  filter(between(date, 2021,2022)) %>%
  summarise(across(c('iiag_overall','iiag_stats'), mean, na.rm=TRUE)) %>%
  select(iso3c,iiag_overall,iiag_stats )
  


#create one database
comparison_avg_df <- spi_index_df %>%
  ungroup() %>%
  group_by(iso3c) %>%
  filter(between(date, 2021,2022)) %>%
  summarise(across(starts_with("SPI.INDEX"), mean, na.rm=TRUE)) %>%
  arrange(-SPI.INDEX) %>%
  mutate(across(starts_with('SPI.INDEX'),~1*.),
         across(starts_with('SPI.INDEX'),round,1)) %>%
  select(iso3c, SPI.INDEX) %>%
  left_join(sci_df) %>%
  left_join(odin_df) %>%
  left_join(odb_df) %>%
  left_join(gdb_df) %>%
  left_join(iiag_df) %>%
  left_join(correlates_df_avg) 

#save to output
write_excel_csv(comparison_avg_df, paste0(output_dir, "/SPI_Index_SDG_comparisons_data_2yr_avg.csv"))


```

## Regression Predictors 

```{r regdf, echo=FALSE}


sdg_index <- read_excel(path=paste0(raw_dir, "/SDR2024-data.xlsx"),
                        sheet='Backdated SDG Index'
) %>%
  transmute(iso3c=id,
         date=year,
         sdg_index_score=`SDG Index Score`) 


#hdi
hdi_df <- read_csv(paste0(raw_dir, 'HDR21-22_Composite_indices_complete_time_series.csv')) %>%
  rename(iso3c=iso3) %>%
  select(iso3c, hdi_1990:hdi_2021) %>%
  pivot_longer(
    cols=c('hdi_1990':'hdi_2021'),
    names_to='index',
    values_to='hdi_value'
  ) %>%
  mutate(
    date=as.numeric(str_sub(index,5,8))
  ) %>%
  select(iso3c, date, hdi_value)


#Yale’s Environmental Performance Index (https://epi.yale.edu/).  Downloaded Aug 24, 2023
epi_2024_df <- read_csv(paste0(raw_dir, 'epi2024results.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.new) %>%
  mutate(date=2024) %>%
  select(iso3c, date, env_perform_index)

epi_2022_df <- read_csv(paste0(raw_dir, 'epi2022results05302022.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.new) %>%
  mutate(date=2022) %>%
  select(iso3c, date, env_perform_index)

epi_2020_df <- read_csv(paste0(raw_dir, 'epi2020results.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.new) %>%
  mutate(date=2020) %>%
  select(iso3c, date, env_perform_index)

epi_2018_df <- read_csv(paste0(raw_dir, 'epi2018results.csv')) %>%
  rename(iso3c=iso,
         env_perform_index=EPI.current) %>%
  mutate(date=2018) %>%
  select(iso3c, date, env_perform_index)

epi_df <- bind_rows(epi_2024_df,epi_2022_df, epi_2020_df, epi_2018_df) %>%
  select(iso3c, date, env_perform_index)

# FAO food security index and prevalence of undernourishment (https://www.fao.org/faostat/en/#data/FS)
fao_stats_df <- read_csv(paste0(raw_dir, "FAOSTAT_data_en_1-6-2025.csv")) %>%
    mutate(geoAreaCode=as.numeric(`Area Code (M49)`)) %>% #convert to numeric
    left_join(
        read_csv(paste0(raw_dir, "/metadata/iso_codes.csv") ) %>%
            select(geoAreaCode, iso3c)
    ) %>%
    filter(Item %in% c(
        "Prevalence of severe food insecurity in the total population (percent) (3-year average)",
        "Prevalence of undernourishment (percent) (3-year average)"
    )) %>%
    mutate(date=as.numeric(str_sub(`Year Code`,1,4))+1)  %>% #add 1 because it is the midpoint of 3 year avg
    select(iso3c, date, Item, Value) %>% #keep only relevant columns
    filter(!is.na(iso3c)) %>%
    #pivot wider
    pivot_wider(
        names_from=Item,
        values_from=Value,
    ) %>%
    #shorten names
    rename(
        undernourish=`Prevalence of undernourishment (percent) (3-year average)`,
        severe_food_insecurity=`Prevalence of severe food insecurity in the total population (percent) (3-year average)`
    ) %>%
    #keep only numeric values in Value using str_remove_all
    mutate(
        undernourish=as.numeric(str_remove_all(undernourish, "[^0-9.]")),
        severe_food_insecurity=as.numeric(str_remove_all(severe_food_insecurity, "[^0-9.]"))
    ) 

##Harvard’s Economic complexity index (https://atlas.cid.harvard.edu/rankings).  Downloaded Aug 24, 2023
ec_complex_df <- read_dta(paste0(raw_dir, "Country Complexity Rankings 1995 - 2022.dta")) %>%
  rename(geoAreaCode=country_id,
         eci_value=hs_eci,
         date=year) %>%
  left_join(iso_codes) %>%
  group_by(iso3c) %>%
  select(iso3c, date, eci_value)

#World press freedom index (https://rsf.org/en/index).  Downloaded Aug 24, 2023
press_free_df <- bind_rows(
  read_delim(paste0(raw_dir, "press_freedom_2023.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2022.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2021.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2020.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2019.csv"), delim=";"),
  read_delim(paste0(raw_dir, "press_freedom_2018.csv"), delim=";")
) %>%
  mutate(Score=if_else(is.na(Score),`Score N`, Score )) %>%
  mutate(Score=case_when(
    Score<100 ~ Score,
    between(Score,100,999) ~ Score/10,
    TRUE ~ Score/100
  )) %>% #fix issue with comma decimal separator
  transmute(
    iso3c=ISO,
    press_free_score=Score,
    date=`Year (N)`
  ) 

#OECD’s better life index (https://www.oecdbetterlifeindex.org/#/11111111111).  Downloaded Aug 24, 2023.  Only one year of data as not comparable over time.
#OECD’s better life index (https://www.oecdbetterlifeindex.org/#/11111111111).  Downloaded Aug 24, 2023.  Only one year of data as not comparable over time.
bli_df <- read_csv(paste0(raw_dir, 'BLI_20230824.csv')) %>% mutate(date=2018) %>%
  bind_rows(read_csv(paste0(raw_dir, 'BLI_2017_20230824.csv')) %>% mutate(date=2017)) %>%
  bind_rows(read_csv(paste0(raw_dir, 'BLI_2016_20230824.csv')) %>% mutate(date=2016)) %>%  bind_rows(read_csv(paste0(raw_dir, 'BLI_2015_20230824.csv')) %>% mutate(date=2015)) %>%  bind_rows(read_csv(paste0(raw_dir, 'BLI_2014_20230824.csv')) %>% mutate(date=2014)) %>%  bind_rows(read_csv(paste0(raw_dir, 'BLI_2013_20230824.csv')) %>% mutate(date=2013)) %>%  rename(iso3c=LOCATION) %>%
  mutate(
    category=case_when(
      Indicator %in% c("Housing expenditure", "Dwellings without basic facilities", "Rooms per person") ~ "Housing",
      Indicator %in% c("Household net wealth", "Household net adjusted disposable income") ~ "Income",
      Indicator %in% c('Labour market insecurity', 'Personal earnings', 'Long-term unemployment rate', 'Employment rate') ~ "Jobs",
      Indicator %in% c('Quality of support network') ~ "Community",
      Indicator %in% c('Years in education','Student skills','Educational attainment') ~ "Education",
      Indicator %in% c('Water quality', "Air pollution") ~ "Environment",
      Indicator %in% c('Stakeholder engagement for developing regulations', 'Voter turnout') ~ "Civic engagement",
      Indicator %in% c('Self-reported health', 'Life expectancy') ~ "Health",
      Indicator %in% c('Life satisfaction') ~ "Life satisfaction",
      Indicator %in% c('Homicide rate', 'Feeling safe walking alone at night') ~ "Safety",
      Indicator %in% c('Time devoted to leisure and personal care','Employees working very long hours') ~ "Work-Life Balance"
      
    )
  ) %>%
  group_by(Indicator, date) %>%
  mutate(
    score=case_when(#give 1 to a best perfomer and 0 to worst
      Indicator %in% c("Housing expenditure", "Dwellings without basic facilities",
                       'Labour market insecurity','Long-term unemployment rate',
                       "Air pollution", 'Homicide rate', 'Employees working very long hours') ~
        1-(Value-min(Value))/(max(Value)-min(Value)),
      TRUE ~ 
        (Value-min(Value))/(max(Value)-min(Value))
    )
  ) %>%
  #zscore all indicators
  mutate(zscore=(Value-mean(Value, na.rm=TRUE))/sd(Value, na.rm=TRUE)) %>%
  group_by(iso3c,category, date  ) %>%
  #combine indicators with equal weight
  summarise(better_life_subindex=mean(score, na.rm=TRUE)) %>%
  group_by(iso3c, date) %>%
  summarise(better_life_index=sum(better_life_subindex)) 


#add legatum prosperity index
# https://www.prosperity.com/
# downloaded on 2024-12-20
legatum_prosperity_df <- read_csv(paste0(raw_dir, 'legatum_prosperity_index.csv')) %>%
  janitor::clean_names() %>%
  transmute(
    iso3c=iso3c,
    date=end_date,
    legatum_health_index=health
  ) %>%
  select(iso3c, date, legatum_health_index)

# add Global Health Security Index (GHSI) overall score
ghsi_df <- read_csv(paste0(raw_dir, '2021-GHS-Index-April-2022.csv')) %>%
  janitor::clean_names() %>%

  transmute(
    iso3c=iso3c,
    date=year,
    ghsindex=overall_score
  ) %>%
  select(iso3c, date, ghsindex)

predictors_df <- wbstats::wb_data(country="countries_only",
             indicator=c('NY.GDP.PCAP.KD', 'NV.IND.MANF.ZS', 'NV.AGR.TOTL.ZS','NE.TRD.GNFS.ZS', 'HD.HCI.OVRL','SE.PRM.ENRR','BN.CAB.XOKA.GD.ZS', 'CC.EST', "GE.EST", 'PV.EST', "RQ.EST", "RL.EST", "VA.EST", "BX.KLT.DINV.WD.GD.ZS", "SI.POV.DDAY", "SI.POV.GINI"),
             start_date=2010,
             end_date=2023) %>%
  mutate(date=as.numeric(date)) %>%
  left_join(sdg_index) %>%
  left_join(hdi_df) %>%
  left_join(epi_df) %>%
  left_join(ec_complex_df) %>%
  left_join(press_free_df) %>%
  left_join(bli_df) %>%       left_join(fao_stats_df) %>%
  left_join(legatum_prosperity_df) %>% left_join(ghsi_df) %>%
  arrange(iso3c, date) %>%
  group_by(iso3c) %>%
  fill(c( 'NY.GDP.PCAP.KD','NV.IND.MANF.ZS', 'NV.AGR.TOTL.ZS', 'SE.PRM.ENRR', 'HD.HCI.OVRL','NE.TRD.GNFS.ZS','BN.CAB.XOKA.GD.ZS', 'CC.EST', "GE.EST", 'PV.EST', "RQ.EST", "RL.EST", "VA.EST", "BX.KLT.DINV.WD.GD.ZS","SI.POV.DDAY", "SI.POV.GINI",
          'sdg_index_score', 'hdi_value', 'env_perform_index', 'eci_value', 
          'press_free_score', 'better_life_index',                   'undernourish', 'severe_food_insecurity', 'legatum_health_index', 'ghsindex'), .direction="downup") %>%
  mutate(WGI.OVL=(CC.EST + GE.EST + PV.EST + RQ.EST + RL.EST + VA.EST)/6,
         lag_outcome=lag(NY.GDP.PCAP.KD),
         lag_fdi=lag(BX.KLT.DINV.WD.GD.ZS),
         lag_ge=lag(GE.EST),
         lag_poverty=lag(SI.POV.DDAY),
         lag_gini=lag(SI.POV.GINI)
         )  %>%
  ungroup()

#save to outputs
write_excel_csv(predictors_df, paste0(output_dir, "/SPI_regression_predictors.csv"))


```

Create a markdown with the descriptions of the variables in people friendly format

| Variable | Short Description |
|----------|-------------|
| SPI.INDEX | Statistical Performance Indicator (SPI) overall score |
| SCI | Statistical Capacity Indicator (SCI) overall score |
| ODIN | Open Data Index (ODIN) overall score |
| ODB | Open Data Barometer (ODB) overall score |
| GDB | Global Data Barometer (GDB) overall score |
| IIAG | Ibrahim Index of African Governance (IIAG) overall score |
| NY.GDP.PCAP.KD | GDP per capita (constant 2015 US$) |
| NV.IND.MANF.ZS | Manufacturing, value added (% of GDP) |
| NV.AGR.TOTL.ZS | Agriculture, forestry, and fishing, value added (% of GDP) |
| NE.TRD.GNFS.ZS | Trade (% of GDP) |
| HD.HCI.OVRL | Human Capital Index (HCI) overall score |
| SE.PRM.ENRR | School enrollment, primary (% gross) |
| BN.CAB.XOKA.GD.ZS | Current account balance (% of GDP) |
| CC.EST | Control of Corruption |
| GE.EST | Government Effectiveness |
| PV.EST | Political Stability and Absence of Violence |
| RQ.EST | Regulatory Quality |
| RL.EST | Rule of Law  |
| VA.EST | Voice and Accountability |
| BX.KLT.DINV.WD.GD.ZS | Foreign direct investment, net inflows (% of GDP) |
| SI.POV.DDAY | Poverty headcount ratio at $2.15 a day (2017 PPP) (% of population) |













